From 3155ce49b8b38c70b2b8054150bffadb880ecd48 Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:23:07 -0400
Subject: [PATCH 01/13] new example output table in readme

---
 README.md | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1ead092..e4993c7 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,13 @@ A LaTeX formatter written in Rust.
 
 ## Formatting style example
 
-Before formatting `example.tex`:
+<table>
+<tr>
+<td>Input file `example.tex`</td>
+<td>After `tex-fmt example.tex`</td>
+</tr>
+<tr>
+<td>
 
 ``` tex
 \documentclass{article}
@@ -30,7 +36,8 @@ E = m c^2
 \end{document}
 ```
 
-After running `tex-fmt example.tex`
+</td>
+<td>
 
 ``` tex
 \documentclass{article}
@@ -49,6 +56,10 @@ After running `tex-fmt example.tex`
 \end{document}
 ```
 
+</td>
+</tr>
+</table>
+
 ## Installation
 
 ### Nix

From 071b297ba4f5b9f43c1b573686b8294e71e166d7 Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:26:07 -0400
Subject: [PATCH 02/13] readme table width

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e4993c7..20f292a 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,10 @@ A LaTeX formatter written in Rust.
 
 ## Formatting style example
 
-<table>
+<table width="100%">
 <tr>
-<td>Input file `example.tex`</td>
-<td>After `tex-fmt example.tex`</td>
+<td>Input file example.tex</td>
+<td>After tex-fmt example.tex</td>
 </tr>
 <tr>
 <td>
From b3a9c0d2ef098db671a96fb345f07299f57afe4f Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Mon, 29 Apr 2024 11:26:41 -0400
Subject: [PATCH 03/13] Update table formatting

---
 README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 20f292a..66e2bcd 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,10 @@ https://mit-license.org/)
 
 A LaTeX formatter written in Rust.
 
-## Formatting style example
-
-<table width="100%">
+<table>
 <tr>
-<td>Input file example.tex</td>
-<td>After tex-fmt example.tex</td>
+<td>example.tex</td>
+<td>tex-fmt example.tex</td>
 </tr>
 <tr>
 <td>
From d4424494fef212ed1957ccb18f0dca967a9f13e1 Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Mon, 29 Apr 2024 15:25:53 -0400
Subject: [PATCH 04/13] Update README table format

---
 README.md | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 66e2bcd..f7b19e4 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,18 @@ A LaTeX formatter written in Rust.
 
 <table>
 <tr>
-<td>example.tex</td>
-<td>tex-fmt example.tex</td>
+<td>
+
+```
+example.tex
+```
+</td>
+<td>
+
+```
+tex-fmt example.tex
+```
+</td>
 </tr>
 <tr>
 <td>
@@ -33,7 +43,6 @@ E = m c^2
 \end{document}
 ```
 
-
 </td>
 <td>
@@ -53,7 +62,6 @@ E = m c^2
 \end{document}
 ```
 
-
 </td>
 </tr>
 </table>
From fdaa9ad4cc98fc2c7b4fc33b870213d7c6952f53 Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Mon, 29 Apr 2024 15:26:56 -0400
Subject: [PATCH 05/13] Fix README table header

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f7b19e4..3fecdb6 100644
--- a/README.md
+++ b/README.md
@@ -11,18 +11,18 @@ A LaTeX formatter written in Rust.
 
 <table>
 <tr>
 <td>
 
-```
-example.tex
-```
+``` shell
+example.tex
+```
 </td>
 <td>
 
-```
-tex-fmt example.tex
-```
+``` shell
+tex-fmt example.tex
+```
 </td>
 </tr>
 <tr>
 <td>
From 99314aeda0d8f75fd6505eff07556e38661d5bb6 Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:39:22 -0400 Subject: [PATCH 06/13] Renaming test case files --- tests/document_in.tex | 2 +- tests/example1_out.tex | 58 - tests/example3_in.tex | 27572 ---------------- ...le2_in.tex => masters_dissertation_in.tex} | 2196 +- ...2_out.tex => masters_dissertation_out.tex} | 0 tests/phd_dissertation_in.tex | 27572 ++++++++++++++++ ...mple3_out.tex => phd_dissertation_out.tex} | 0 tests/{example4_in.tex => readme_in.tex} | 0 tests/{example4_out.tex => readme_out.tex} | 0 tests/short_document_in.tex | 58 + ...example1_in.tex => short_document_out.tex} | 0 11 files changed, 28729 insertions(+), 28729 deletions(-) delete mode 100644 tests/example1_out.tex delete mode 100644 tests/example3_in.tex rename tests/{example2_in.tex => masters_dissertation_in.tex} (53%) rename tests/{example2_out.tex => masters_dissertation_out.tex} (100%) create mode 100644 tests/phd_dissertation_in.tex rename tests/{example3_out.tex => phd_dissertation_out.tex} (100%) rename tests/{example4_in.tex => readme_in.tex} (100%) rename tests/{example4_out.tex => readme_out.tex} (100%) create mode 100644 tests/short_document_in.tex rename tests/{example1_in.tex => short_document_out.tex} (100%) diff --git a/tests/document_in.tex b/tests/document_in.tex index 5449256..5ec4cc9 100644 --- a/tests/document_in.tex +++ b/tests/document_in.tex @@ -2,6 +2,6 @@ \begin{document} - Documents should not be globally indented. +Documents should not be globally indented. \end{document} diff --git a/tests/example1_out.tex b/tests/example1_out.tex deleted file mode 100644 index 1f45a5a..0000000 --- a/tests/example1_out.tex +++ /dev/null @@ -1,58 +0,0 @@ -\documentclass{article} - -\usepackage{amsmath} -\usepackage{amsthm} - -\newtheorem{theorem}{Theorem} - -\title{Testing \texttt{texfmt}} -\author{William G.\ Underwood} -\begin{document} -\maketitle - -\begin{align} - E = m c^2 \\ - 1 + 2 - + (3 + 4) - + (5 + 6 - + 7 + 8) - + (9 + 10 - + 11 + 12 - + 13 + 14) -\end{align} - -\begin{itemize} - \item Item one % trailing comment with ]) brackets - \item Item two on - multiple lines - \item - \item Item three - \begin{itemize} - \item Subitem one of item two - \item Subitem two of item two - \end{itemize} - \item Item four % trailing comment with [( brackets -\end{itemize} - -\begin{theorem}[Pythagoras]% - \label{thm:pythagoras} - - For a right triangle with hypotenuse $c$ and other sides $a$ and $b$, - we have - % - \begin{align*} - a^2 + b^2 = c^2 - \end{align*} - % - % some comments - -\end{theorem} - -This line contains \emph{emphasized} text. -\emph{This line contains only emphasized text, -and is broken over two lines}. -\emph{This line contains only - emphasized text, -and is broken over three lines}. - -\end{document} diff --git a/tests/example3_in.tex b/tests/example3_in.tex deleted file mode 100644 index 77ac71b..0000000 --- a/tests/example3_in.tex +++ /dev/null @@ -1,27572 +0,0 @@ -% !TeX program = lualatex - -%! 
TeX root = phd_dissertation.tex - -\pdfvariable suppressoptionalinfo 512\relax -\documentclass[11pt,lof]{puthesis} - -% packages -\usepackage{amsmath} -\usepackage{amssymb} -\usepackage[amsmath,thmmarks,noconfig]{ntheorem} -\usepackage{mathtools} -\usepackage{multirow} -\usepackage{pgfplots} -\usepackage{graphicx} -\usepackage{enumitem} -\usepackage{subcaption} -\usepackage{titlesec} -\usepackage{stackengine} -\usepackage{scalerel} -\usepackage{microtype} -\usepackage[boxruled,linesnumbered,commentsnumbered,procnumbered]{algorithm2e} -\usepackage[longnamesfirst]{natbib} -\usepackage[hypertexnames=false,hidelinks]{hyperref} -\usepackage[norefs,nocites]{refcheck} -\usepackage[defaultlines=3,all]{nowidow} -\usepackage{float} - -% settings -\pgfplotsset{compat=1.9} -\newcommand{\TODO}[1]{\textcolor{red}{\textsc{TODO}: #1}} -\setcitestyle{round} -\captionsetup[subfigure]{justification=centering} -\def\arraystretch{1.3} -\renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textit{#1}} - -% tables numbered as figures -\def\table{\def\figurename{Table}\figure} -\let\endtable\endfigure -\renewcommand\listfigurename{List of Figures and Tables} - -% arxiv -\newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{\texttt{arXiv:#1}}} - -% github -\newcommand{\github}[1]{\href{https://github.com/#1}{\texttt{github.com/#1}}} - -% blackboard -\renewcommand{\P}{\ensuremath{\mathbb{P}}} -\newcommand{\N}{\ensuremath{\mathbb{N}}} -\newcommand{\R}{\ensuremath{\mathbb{R}}} -\newcommand{\E}{\ensuremath{\mathbb{E}}} -\newcommand{\Q}{\ensuremath{\mathbb{Q}}} -\newcommand{\I}{\ensuremath{\mathbb{I}}} -\newcommand{\Z}{\ensuremath{\mathbb{Z}}} - -% roman -\newcommand{\rF}{\ensuremath{\mathrm{F}}} -\newcommand{\rH}{\ensuremath{\mathrm{H}}} -\newcommand{\rL}{\ensuremath{\mathrm{L}}} -\newcommand{\rk}{\ensuremath{\mathrm{k}}} -\newcommand{\rd}{\ensuremath{\mathrm{d}}} -\newcommand{\comp}{\ensuremath{\mathrm{c}}} -\newcommand{\TV}{\mathrm{TV}} - -% bold -\newcommand{\bW}{\ensuremath{\mathbf{W}}} -\newcommand{\bY}{\ensuremath{\mathbf{Y}}} -\newcommand{\bX}{\ensuremath{\mathbf{X}}} -\newcommand{\bT}{\ensuremath{\mathbf{T}}} -\newcommand{\bA}{\ensuremath{\mathbf{A}}} -\newcommand{\bV}{\ensuremath{\mathbf{V}}} - -% calligraphic -\newcommand{\cH}{\ensuremath{\mathcal{H}}} -\newcommand{\cF}{\ensuremath{\mathcal{F}}} -\newcommand{\cN}{\ensuremath{\mathcal{N}}} -\newcommand{\cX}{\ensuremath{\mathcal{X}}} -\newcommand{\cG}{\ensuremath{\mathcal{G}}} -\newcommand{\cW}{\ensuremath{\mathcal{W}}} -\newcommand{\cB}{\ensuremath{\mathcal{B}}} -\newcommand{\cS}{\ensuremath{\mathcal{S}}} -\newcommand{\cT}{\ensuremath{\mathcal{T}}} -\newcommand{\cV}{\ensuremath{\mathcal{V}}} -\newcommand{\cE}{\ensuremath{\mathcal{E}}} -\newcommand{\cU}{\ensuremath{\mathcal{U}}} -\newcommand{\cR}{\ensuremath{\mathcal{R}}} -\newcommand{\cA}{\ensuremath{\mathcal{A}}} -\newcommand{\cC}{\ensuremath{\mathcal{C}}} -\newcommand{\cM}{\ensuremath{\mathcal{M}}} -\newcommand{\cD}{\ensuremath{\mathcal{D}}} -\newcommand{\cP}{\ensuremath{\mathcal{P}}} -\newcommand{\cI}{\ensuremath{\mathcal{I}}} -\newcommand{\cY}{\ensuremath{\mathcal{Y}}} - -% sans serif -\newcommand{\T}{\ensuremath{\mathsf{T}}} - -% symbols -\newcommand{\vvvert}{{\vert\kern-0.25ex\vert\kern-0.25ex\vert}} -\newcommand{\bigvvvert}{{\big\vert\kern-0.35ex\big\vert\kern-0.35ex\big\vert}} -\newcommand{\Bigvvvert}{{\Big\vert\kern-0.3ex\Big\vert\kern-0.3ex\Big\vert}} -\newcommand{\bigsetminus}{\mathbin{\big\backslash}} -\newcommand{\Bigsetminus}{\mathbin{\Big\backslash}} 
-\newcommand{\dprime}{\ensuremath{\prime\prime}} -\newcommand{\tprime}{\ensuremath{\prime\prime\prime}} -\newcommand{\objective}{\ensuremath{\mathrm{obj}}} -\newcommand{\Dl}{\ensuremath{D_{\textup{lo}}}} -\newcommand{\Du}{\ensuremath{D_{\textup{up}}}} - -% floor of beta -\newcommand{\flbeta}{{\ThisStyle{% - \ensurestackMath{\stackengine{-0.5\LMpt}{\SavedStyle \beta}% - {\SavedStyle {\rule{3.7\LMpt}{0.3\LMpt}}} -{U}{c}{F}{F}{S}}\vphantom{\beta}}}} - -% operators -\DeclareMathOperator{\Var}{Var} -\DeclareMathOperator{\Cov}{Cov} -\DeclareMathOperator{\AIMSE}{AIMSE} -\DeclareMathOperator{\LOOCV}{LOOCV} -\DeclareMathOperator{\symconv}{symconv} -\DeclareMathOperator{\GCV}{GCV} -\DeclareMathOperator{\Unif}{Unif} -\DeclareMathOperator*{\logistic}{logistic} -\DeclareMathOperator{\Bias}{Bias} -\DeclareMathOperator{\Env}{Env} -\DeclareMathOperator*{\esssup}{ess\,sup} -\DeclareMathOperator{\Ber}{Ber} -\DeclareMathOperator{\KL}{KL} -\DeclareMathOperator{\Gam}{Gam} -\DeclareMathOperator{\Yule}{Yule} -\DeclareMathOperator{\rank}{rank} -\DeclareMathOperator{\Exp}{Exp} -\DeclareMathOperator{\Bin}{Bin} -\DeclareMathOperator{\Tr}{Tr} -\DeclareMathOperator{\Leb}{Leb} -\DeclareMathOperator*{\argmin}{arg\,min} -\DeclareMathOperator*{\minimize}{minimize:} -\DeclareMathOperator*{\subjectto}{subject\ to:} -\DeclareMathOperator{\ROT}{ROT} -\newcommand{\diff}[1]{\,\mathrm{d}#1} - -% theorem environments -\renewtheoremstyle{break}{% -\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % - ##2}\hbox{\strut}}}]% -}{% -\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % - ##2\ \normalfont (##3)}\hbox{\strut}}}]% -} -\theoremstyle{break} -\theorempreskip{7mm} -\newtheorem{theorem}{Theorem}[section] -\newtheorem{lemma}{Lemma}[section] -\newtheorem{assumption}{Assumption}[section] -\newtheorem{corollary}{Corollary}[section] -\newtheorem{proposition}{Proposition}[section] -\newtheorem{definition}{Definition}[section] -\newtheorem{remark}{Remark}[section] - -% proof environments -\let\proof\relax -\newtheoremstyle{proof}{% -\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % - }\hbox{\strut}}}]% -}{% -\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % - \normalfont (##3)}\hbox{\strut}}}]% -} -\theoremstyle{proof} -\theorembodyfont{\upshape} -\theorempreskip{7mm} -\theoremsymbol{\ensuremath{\square}} -\newtheorem{proof}{Proof} -\AtBeginEnvironment{proof}{\setcounter{proofparagraphcounter}{0}}% - -% proof paragraphs -\titleformat{\paragraph}[hang]{\bfseries\upshape}{}{0pt}{}[] -\titlespacing*{\paragraph}{0pt}{6pt}{0pt} -\newcounter{proofparagraphcounter} -\newcommand{\proofparagraph}[1]{ - \refstepcounter{proofparagraphcounter}% -\paragraph{Part \theproofparagraphcounter : #1}}% - -% inline roman lists -\newlist{inlineroman}{enumerate*}{1} -\setlist[inlineroman]{afterlabel=~,label=(\roman*)} - -% algorithms -\DontPrintSemicolon% -\makeatletter% -\renewcommand{\SetKwInOut}[2]{% - \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% - \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% - \newcommand\InOutSizeDefined{}% - \setlength{\inoutsize}{\wd\algocf@inoutbox}% - \sbox\algocf@inoutbox{% - \parbox[t]{\inoutsize}% - {\KwSty{#2}\algocf@typo:\hfill}~% - }% - \setlength{\inoutindent}{\wd\algocf@inoutbox}% - \else% - \ifdim\wd\algocf@inoutbox>\inoutsize% - \setlength{\inoutsize}{\wd\algocf@inoutbox}% - \sbox\algocf@inoutbox{% - \parbox[t]{\inoutsize}% - {\KwSty{#2}\algocf@typo:\hfill}~% - }% - \setlength{\inoutindent}{\wd\algocf@inoutbox}% - \fi% - \fi% - 
\algocf@newcommand{#1}[1]{% - \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}{% - \let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% - {\inoutsize}{\KwSty{#2}% - \algocf@typo:\hfill}~##1\par% - }% - \algocf@linesnumbered% - }% -}% -\makeatother% -\SetKwInOut{Input}{Input}% -\SetKwInOut{Output}{Output}% -\setlength{\algomargin}{2em}% - -\author{William George Underwood} -\adviser{Matias Damian Cattaneo} -\title{Estimation and Inference in \\ Modern Nonparametric Statistics} - -\abstract{ - -% 350 words max - -Nonparametric methods are central to modern statistics, enabling data analysis -with minimal assumptions in a wide range of scenarios. While contemporary -procedures such as random forests and kernel methods are popular due to their -performance and flexibility, their statistical properties are often less well -understood. The availability of sound inferential techniques is vital in the -sciences, allowing researchers to quantify uncertainty in their models. We -develop methodology for robust and practical statistical estimation and -inference in some modern nonparametric settings involving complex estimators -and nontraditional data. - -We begin in the regression setting by studying the Mondrian random forest, a -variant in which the partitions are drawn from a Mondrian process. We present a -comprehensive analysis of the statistical properties of Mondrian random -forests, including a central limit theorem for the estimated regression -function and a characterization of the bias. We show how to conduct feasible -and valid nonparametric inference by constructing confidence intervals, and -further provide a debiasing procedure that enables minimax-optimal estimation -rates for smooth function classes in arbitrary dimension. - -Next, we turn our attention to nonparametric kernel density estimation with -dependent dyadic network data. We present results for minimax-optimal -estimation, including a novel lower bound for the dyadic uniform convergence -rate, and develop methodology for uniform inference via confidence bands and -counterfactual analysis. Our methods are based on strong approximations and are -designed to be adaptive to potential dyadic degeneracy. We give empirical -results with simulated and real-world economic trade data. - -Finally, we develop some new probabilistic results with applications to -nonparametric statistics. Coupling has become a popular approach for -distributional analysis in recent years, and Yurinskii's method stands out for -its wide applicability and explicit formulation. We present a generalization of -Yurinskii's coupling, treating approximate martingale data under weaker -conditions than previously imposed. We allow for Gaussian mixture coupling -distributions, and a third-order method permits faster rates in certain -situations. We showcase our results with applications to factor models and -martingale empirical processes, as well as nonparametric partitioning-based and -local polynomial regression procedures. - } -\acknowledgments{ - -I am extremely fortunate to have been surrounded by many truly wonderful people -over the course of my career, and without their support this dissertation would -not have been possible. While it is impossible for me to identify every one of -them individually, I would like to mention a few names in particular to -recognize those who have been especially important to me during the last few -years. 
- -Firstly, I would like to express my utmost gratitude to my Ph.D.\ adviser, -Matias Cattaneo. Working with Matias has been genuinely inspirational for me, -and I could not have asked for a more rewarding start to my journey as a -researcher. From the very beginning, he has guided me expertly through my -studies, providing hands-on assistance when required while also allowing me the -independence necessary to develop as an academic. I hope that, during the four -years we have worked together, I have acquired just a fraction of his formidable -mathematical intuition, keen attention to detail, boundless creativity, and -inimitable pedagogical skill. Alongside his role as my adviser, Matias has been -above all a friend, who has been in equal measure inspiring, -insightful, dedicated, understanding, and kind. - -Secondly, I would like to thank all of the faculty members at Princeton and -beyond who have acted as my collaborators and mentors, without whom none of my -work could have been realized. In particular, I express my gratitude to my -tireless Ph.D.\ committee members and letter writers Jianqing Fan and Jason -Klusowski, my coauthors Yingjie Feng and Ricardo Masini, my dissertation reader -Boris Hanin, my teachers -Amir Ali Ahmadi, Ramon van Handel, Mikl{\'o}s R{\'a}cz, and Mykhaylo Shkolnikov, -my colleagues Sanjeev Kulkarni and Roc{\'i}o Titiunik, -and my former supervisor Mihai Cucuringu. -I am also thankful for the staff members at Princeton who have been -perpetually helpful, and I would like to identify Kim -Lupinacci in particular; her assistance in all things administrative has been -invaluable. - -I am grateful to my fellow graduate students in the ORFE department for their -technical expertise and generosity with their time, and for making Sherrerd -Hall such a vibrant and exciting space, especially -Jose Avilez, -Pier Beneventano, -Ben Budway, -Rajita Chandak, -Abraar Chaudhry, -Stefan Clarke, -Giulia Crippa, -G{\"o}k{\c c}e Dayan{\i}kl{\i}, -Nicolas Garcia, -Felix Hoefer, -Erica Lai, -Jackie Lok, -Maya Mutic, -Dan Rigobon, -Till Saenger, -Rajiv Sambharya, -Boris Shigida, -Igor Silin, -Giang Truong, -and Rae Yu. -Our -regular social events made a contribution to my well-being which is difficult -to overstate. My thanks extend also to the students I taught, as -well as to my group of senior thesis undergraduates, for their commitment, -patience, and responsiveness. - -More broadly, I would like to thank all of my friends, near and far, -for their unfailing support and reliability, and for -helping to create so many of my treasured memories. In particular, -Ole Agersnap, -James Ashford, -Christian Baehr, -Chris Bambic, -Kevin Beeson, -James Broadhead, -Alex Cox, -Reece Edmends, -Robin Franklin, -Greg Henderson, -Bonnie Ko, -Grace Matthews, -Dan Mead, -Ben Musachio, -Jacob Neis, -Monika Papayova, -Will Pedrick, -Oliver Philcox, -Nandita Rao, -Alex Rice, -Edward Rowe, -David Snyder, -Titi Sodimu, -Nikitas Tampakis, -and Anita Zhang. -Thank you to the Princeton Chapel Choir for being such a wonderful -community of musicians and a source of close friends, -and to our directors, Nicole Aldrich and Penna Rose, and organist Eric Plutz. - -Lastly, yet most importantly, I want to thank my family for their unwavering -support throughout my studies. My visits back home have been a source of joy -throughout my long and often challenging Ph.D., and I cherish every moment I -have spent with my parents, sister, grandparents, and extended family. 
- } - -\begin{document} - - -\chapter{Introduction} - -% nonparametric estimation is common -Nonparametric estimation procedures are at the heart of many contemporary -theoretical and methodological topics within the fields of statistics, data -science, and machine learning. Where classical parametric techniques impose -specific distributional and structural assumptions when modeling statistical -problems, nonparametric methods instead take a more flexible approach, -typically positing only high-level restrictions such as moment conditions, -independence criteria, and smoothness assumptions. Examples of such procedures -abound in modern data science and machine learning, encompassing histograms, -kernel estimators, smoothing splines, decision trees, nearest neighbor methods, -random forests, neural networks, and many more. - -% nonparametric estimation is good -The benefits of the nonparametric framework are clear: statistical procedures -can be formulated in cases where the stringent assumptions of parametric models -are untestable, demonstrably violated, or simply unreasonable. -As a consequence, -the resulting methods often inherit desirable robustness properties against -various forms of misspecification or misuse. The class of problems that can be -formulated is correspondingly larger: arbitrary distributions and -relationships can be characterized and estimated in a principled manner. - -% nonparametric estimation is hard -Nonetheless, these attractive properties do come at a price. In particular, as -its name suggests, the nonparametric approach forgoes the ability to reduce -a complex statistical problem to that of estimating a fixed, finite number of -parameters. Rather, nonparametric procedures typically involve making inferences -about a growing number of parameters simultaneously, as witnessed in -high-dimensional regimes, or even directly handling infinite-dimensional -objects such as entire regression or density functions. As a consequence, -nonparametric estimators are usually less efficient than their -correctly specified parametric counterparts, when they are available; rates of -convergence tend to be slower, and confidence sets more conservative. Another -challenge is that theoretical mathematical analyses of nonparametric estimators -are often significantly more demanding than those required for low-dimensional -parametric settings, necessitating tools from contemporary developments in -high-dimensional concentration phenomena, coupling and strong approximation -theory, empirical processes, mathematical optimization, and stochastic -calculus. - -% nonparametric inference -In addition to providing accurate point estimates of unknown (possibly -high-dimensional or infinite-dimensional) quantities of interest, modern -nonparametric procedures are also expected to come equipped with methodologies -for conducting statistical inference. The availability of such inferential -techniques is paramount, with contemporary nonparametric methods forming a -ubiquitous component of modern data science tool kits. Valid uncertainty -quantification is essential for hypothesis testing, error bar construction, -assessing statistical significance, and performing power analyses. 
Inference is -a central concept in classical statistics, and despite the rapid -recent development of theory for modern nonparametric estimators, their -applicability to statistical inference is in certain cases rather less well -studied; theoretically sound and practically implementable inference procedures -are sometimes absent in the literature. - -% complex data -In any statistical modeling problem, the selection and application of an -estimator must naturally be tailored to the available data. Today, much of the -data produced and analyzed does not necessarily fit neatly into the classical -framework of independent and identically distributed samples, and instead might -consist of time series, stochastic processes, networks, -or high-dimensional or functional data, to name just a few. -Therefore, it is important to understand how nonparametric methods might be -adapted to correctly handle these data types, maintaining fast estimation rates -and valid techniques for statistical inference. The technical challenges -associated with such an endeavor are non-trivial; many standard techniques are -ineffective in the presence of dependent or infinite-dimensional data, for -example. As such, the development of new mathematical results in probability -theory plays an important role in the comprehensive treatment of nonparametric -statistics with complex data. - -\section*{Overview of the dissertation} - -% what we do -This dissertation presents a selection of topics relating to nonparametric -estimation and inference, and the associated technical mathematical tools. - -% mondrian -Chapter~\ref{ch:mondrian}, titled ``Inference with Mondrian Random Forests,'' -is based on the work of \citet{cattaneo2023inference}. -% what are random forests -Random forests are popular ensembling-based methods for classification and -regression, which are well known for their good performance, flexibility, -robustness, and efficiency. The majority of random forest models share the -following common framework for producing estimates of a classification or -regression function using covariates and a response variable. Firstly, the -covariate space is partitioned in some algorithmic manner, possibly using a -source of external randomness. Secondly, a local estimator of the -classification or regression function is fitted to the responses in each cell -separately, yielding a tree estimator. Finally, this process is repeated with -many different partitions, and the resulting tree estimators are averaged to -produce a random forest. - -% why are there variants -Many different variants of random forests have been proposed in recent years, -typically with the aim of improving their statistical or computational -properties, or simplifying their construction in order to permit a more -detailed theoretical analysis. -% mondrian random forests -One interesting such example is that of the Mondrian random forest, in which -the underlying partitions (or trees) are constructed independently of the data. -Naturally, this restriction rules out many classical random forest models, which -exhibit a complex and data-dependent partitioning scheme. Instead, trees are -sampled from a canonical stochastic process known as the Mondrian process, -which endows the resulting tree and forest estimators with various agreeable -features. - -% what we do -We study the estimation and inference properties of Mondrian -random forests in the nonparametric regression setting. 
In particular, we -establish a novel central limit theorem for the estimates made by a Mondrian -random forest which, when combined with a characterization of the bias and a -consistent variance estimator, allows one to perform asymptotically valid -statistical inference, such as constructing confidence intervals, on the -unknown regression function. We also provide a debiasing procedure for Mondrian -random forests, which allows them to achieve minimax-optimal estimation rates -with H{\"o}lder smooth regression functions, for any smoothness parameter and -in arbitrary dimension. - -% kernel -Chapter~\ref{ch:kernel}, titled ``Dyadic Kernel Density Estimators,'' is based -on the work of \citet{cattaneo2024uniform}. Network data plays an important role -in statistics, econometrics, and many other data science disciplines, providing -a natural framework for modeling relationships between units, be they people, -financial institutions, proteins, or economic entities. Of prominent interest -is the task of performing statistical estimation and inference with data -sampled from the edges of such networks, known as dyadic data. The archetypal -lack of independence between edges in a network renders many classical -statistical tools unsuited for direct application. As such, researchers must -appeal to techniques tailored to dyadic data in order to accurately capture the -complex structure present in the network. - -% broad scope -We focus on nonparametric estimation and inference with dyadic -data, and in particular we seek methods that are robust in the sense that our -results should hold uniformly across the support of the data. Such uniformity -guarantees allow for statistical inference in a broader range of settings, -including specification testing and distributional counterfactual analysis. We -specifically consider the problem of uniformly estimating a dyadic -density function, focusing on kernel estimators taking the form of dyadic -empirical processes. - -% main contributions -Our main contributions include the minimax-optimal uniform convergence rate of -the dyadic kernel density estimator, along with strong approximation results -for the associated standardized and Studentized $t$-processes. A consistent -variance estimator enables the construction of feasible uniform -confidence bands for the unknown density function. We showcase the broad -applicability of our results by developing novel counterfactual density -estimation and inference methodology for dyadic data, which can be used for -causal inference and program evaluation. -% why it is difficult -A crucial feature of dyadic distributions is that they may be ``degenerate'' at -certain points in the support of the data, a property that makes our analysis -somewhat delicate. Nonetheless, our methods for uniform inference remain robust -to the potential presence of such points. -% applications -For implementation purposes, we discuss inference procedures based on positive -semi-definite covariance estimators, mean squared error optimal bandwidth -selectors, and robust bias correction. We illustrate the empirical -performance of our methods in simulations and with -real-world trade data, for which we make comparisons between observed and -counterfactual trade distributions in different years. Our technical results -on strong approximations and maximal inequalities are of potential -independent interest. 
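To fix ideas, a prototypical dyadic kernel density estimator smooths over all
pairs of units; the notation here is purely illustrative, and the precise
formulation and assumptions are given in Chapter~\ref{ch:kernel}. With scalar
pairwise observations $W_{i j}$ for $1 \leq i < j \leq n$, a kernel function
$K$, and a bandwidth $h > 0$, such an estimator takes the form
%
\begin{align*}
  \hat f_W(w)
  &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n}
  \frac{1}{h} K \left( \frac{W_{i j} - w}{h} \right).
\end{align*}
%
Unlike in the classical i.i.d.\ setting, summands sharing an index need not be
independent, and it is precisely this dependence structure which necessitates
the dyadic-specific theory developed in that chapter.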
- -% yurinskii -Finally, Chapter~\ref{ch:yurinskii}, titled ``Yurinskii's Coupling for -Martingales,'' is based on the work of \citet{cattaneo2022yurinskii}. -Yurinskii's coupling is a popular theoretical tool for non-asymptotic -distributional analysis in mathematical statistics and applied probability. -Coupling theory, also known as strong approximation, provides an alternative -framework to the more classical weak convergence approach to statistical -analysis. Rather than merely approximating the distribution of a random -variable, strong approximation techniques construct a sequence of random -variables which are close almost surely or in probability, often with -finite-sample guarantees. - -% what is it used for -Coupling allows distributional analysis in settings where weak convergence -fails, including many applications to nonparametric or high-dimensional -statistics; it is a key technical component in the main strong approximation -results of our Chapter~\ref{ch:kernel}. The Yurinskii method specifically -offers a Gaussian coupling with an explicit error bound under easily verified -conditions; originally stated in $\ell^2$-norm for sums of independent random -vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p -\leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some -strong conditions. - -% what we do -We present as our main result a Yurinskii coupling for approximate martingales -in $\ell^p$-norm, under substantially weaker conditions than previously -imposed. Our formulation allows the coupling variable to follow a -general Gaussian mixture distribution, and we provide a novel third-order -coupling method which gives tighter approximations in certain situations. We -specialize our main result to mixingales, martingales, and independent data, -and derive uniform Gaussian mixture strong approximations for martingale -empirical processes. Applications to nonparametric partitioning-based and local -polynomial regression procedures are provided. - -% appendices -Supplementary materials for Chapters~\ref{ch:mondrian}, \ref{ch:kernel}, and -\ref{ch:yurinskii} are provided in Appendices~\ref{app:mondrian}, -\ref{app:kernel}, and \ref{app:yurinskii} respectively. These contain detailed -proofs of the main results, additional technical contributions, and further -discussion. - -\chapter[Inference with Mondrian Random Forests]% -{Inference with \\ Mondrian Random Forests} -\label{ch:mondrian} - -% abstract -Random forests are popular methods for classification and regression, and many -different variants have been proposed in recent years. One interesting example -is the Mondrian random forest, in which the underlying trees are constructed -according to a Mondrian process. In this chapter we give a central limit theorem -for the estimates made by a Mondrian random forest in the regression setting. -When combined with a bias characterization and a consistent variance estimator, -this allows one to perform asymptotically valid statistical inference, such as -constructing confidence intervals, on the unknown regression function. We also -provide a debiasing procedure for Mondrian random forests which allows them to -achieve minimax-optimal estimation rates with $\beta$-H{\"o}lder regression -functions, for all $\beta$ and in arbitrary dimension, assuming appropriate -parameter tuning. 
- -\section{Introduction} - -Random forests, first introduced by \citet{breiman2001random}, are a workhorse -in modern machine learning for classification and regression tasks. -Their desirable traits include computational efficiency (via parallelization -and greedy heuristics) in big data settings, simplicity of configuration and -amenability to tuning parameter selection, ability to adapt to latent structure -in high-dimensional data sets, and flexibility in handling mixed data types. -Random forests have achieved great empirical successes in many fields of study, -including healthcare, finance, online commerce, text analysis, bioinformatics, -image classification, and ecology. - -Since Breiman introduced random forests over twenty years ago, the study of -their statistical properties remains an active area of research: see -\citet{scornet2015consistency}, \citet{chi2022asymptotic}, -\citet{klusowski2024large}, and references therein, for a sample of recent -developments. Many fundamental questions about Breiman's random forests remain -unanswered, owing in part to the subtle ingredients present in the estimation -procedure which make standard analytical tools ineffective. These technical -difficulties stem from the way the constituent trees greedily partition the -covariate space, utilizing both the covariate and response data. This creates -complicated dependencies on the data which are often exceedingly hard to -untangle without overly stringent assumptions, thereby hampering theoretical -progress. - -To address the aforementioned technical challenges while retaining the -phenomenology of Breiman's random forests, a variety of stylized versions of -random forest procedures have been proposed and studied in the literature. -These include centered random forests -\citep{biau2012analysis,arnould2023interpolation} and median random forests -\citep{duroux2018impact,arnould2023interpolation}. Each tree in a centered -random forest is constructed by first choosing a covariate uniformly at random -and then splitting the cell at the midpoint along the direction of the chosen -covariate. Median random forests operate in a similar way, but involve the -covariate data by splitting at the empirical median along the direction of the -randomly chosen covariate. Known as purely random forests, these procedures -simplify Breiman's original---albeit more data-adaptive---version by growing -trees that partition the covariate space in a way that is statistically -independent of the response data. - -Yet another variant of random forests, Mondrian random forests -\citep{lakshminarayanan2014mondrian}, have received significant attention in -the statistics and machine learning communities in recent years -\citep{ma2020isolation, mourtada2020minimax, scillitoe2021uncertainty, -mourtada2021amf, vicuna2021reducing, gao2022towards, oreilly2022stochastic}. -Like -other purely random forest variants, Mondrian random forests offer a simplified -modification of Breiman's original proposal in which the partition is generated -independently of the data and according to a canonical stochastic process known -as the Mondrian process \citep{roy2008mondrian}. The Mondrian process takes a -single parameter $\lambda > 0$ known as the ``lifetime'' and enjoys various -mathematical properties. 
These probabilistic -features allow Mondrian random forests to be -fitted in an online manner as well as being subject to a rigorous statistical -analysis, while also retaining some of the appealing features of other -more traditional random forest methods. - -This chapter studies the statistical properties of Mondrian random forests. We -focus on this purely random forest variant not only because of its importance -in the development of random forest theory in general, but also because the -Mondrian process is, to date, the only known recursive tree mechanism involving -randomization, pure or data-dependent, for which the resulting random forest is -minimax-optimal for point estimation over a class of smooth regression -functions in arbitrary dimension \citep{mourtada2020minimax}. In fact, when the -covariate dimension exceeds one, the aforementioned centered and median random -forests are both minimax-\emph{suboptimal}, due to their large biases, over the -class of Lipschitz smooth regression functions \citep{klusowski2021sharp}. It -is therefore natural to focus our study of inference for random forests on -versions that at the very least exhibit competitive bias and variance, as this -will have important implications for the trade-off between precision and -confidence. - -Despite their recent popularity, relatively little is known about the formal -statistical properties of Mondrian random forests. Focusing on nonparametric -regression, \citet{mourtada2020minimax} recently showed that Mondrian forests -containing just a single tree (called a Mondrian tree) can be minimax-optimal -in integrated mean squared error whenever the regression function is -$\beta$-H{\"o}lder continuous for some $\beta \in (0, 1]$. The authors also -showed that, when appropriately tuned, large Mondrian random forests can be -similarly minimax-optimal for $\beta \in (0, 2]$, while the constituent trees -cannot. See also \citet{oreilly2022stochastic} for analogous results for more -general -Mondrian tree and forest constructions. These results formally demonstrate the -value of ensembling with random forests from a point estimation perspective. No -results are currently available in the literature for statistical inference -using Mondrian random forests. - -This chapter contributes to the literature on the foundational statistical -properties of Mondrian random forest regression estimation with two main -results. Firstly, we give a central limit theorem for the classical Mondrian -random forest point estimator, and propose valid large-sample inference -procedures employing a consistent standard error estimator. We establish this -result by deploying a martingale central limit theorem -\citep[Theorem~3.2]{hall1980martingale} because we need to handle delicate -probabilistic features of the Mondrian random forest estimator. In particular, -we deal with the existence of Mondrian cells which are ``too small'' and lead -to a reduced effective (local) sample size for some trees in the forest. Such -pathological cells are in fact typical in Mondrian random forests and -complicate the probability limits of certain sample averages; in fact, small -Mondrian random forests (or indeed single Mondrian trees) remain random even -in the limit due to the lack of ensembling. 
The presence of small cells -renders inapplicable prior distributional approximation results for -partitioning-based estimators in the literature -\citep{huang2003local,cattaneo2020large}, since the commonly required -quasi-uniformity assumption on the underlying partitioning scheme is violated -by cells generated using the Mondrian process. We circumvent this -technical challenge by establishing new theoretical results for Mondrian -partitions and their associated Mondrian trees and forests, which may be of -independent interest. - -The second main contribution of the chapter is to propose a debiasing approach -for the Mondrian random forest point estimator. We accomplish this by first -precisely characterizing the probability limit of the large sample conditional -bias, and then applying a debiasing procedure based on the generalized -jackknife \citep{schucany1977improvement}. We thus exhibit a Mondrian random -forest variant which is minimax-optimal in pointwise mean squared error when -the regression function is $\beta$-H{\"o}lder for any $\beta > 0$. Our method -works by generating an ensemble of Mondrian random forests carefully chosen to -have smaller misspecification bias when extra smoothness is available, -resulting in minimax optimality even for $\beta > 2$. This result complements -\citet{mourtada2020minimax} by demonstrating the existence of a class of -Mondrian random forests that can efficiently exploit the additional smoothness -of the unknown regression function for minimax-optimal point estimation. Our -proposed debiasing procedure is also useful when conducting statistical -inference because it provides a principled method for ensuring that the bias is -negligible relative to the standard deviation of the estimator. More -specifically, we use our debiasing approach to construct valid inference -procedures based on robust bias correction -\citep{calonico2018effect,calonico2022coverage}. - -This chapter is structured as follows. In Section~\ref{sec:mondrian_setup} we -introduce the Mondrian process and give our assumptions on the data generating -process, using a H{\"o}lder smoothness condition on the regression function to -control the bias of various estimators. We define the Mondrian random forest -estimator and present our assumptions on its lifetime parameter and the number -of trees. We give our notation for the following sections in this chapter. - -Section~\ref{sec:mondrian_inference} presents our first set of main results, -beginning with a central limit theorem for the centered Mondrian random forest -estimator (Theorem~\ref{thm:mondrian_clt}), in which we characterize the -limiting -variance. Theorem~\ref{thm:mondrian_bias} complements this result by precisely -calculating the limiting bias of the estimator, with the aim of subsequently -applying a debiasing procedure. To enable valid feasible statistical inference, -we provide a consistent variance estimator in -Theorem~\ref{thm:mondrian_variance_estimation} and briefly discuss implications -for -lifetime parameter selection. - -In Section~\ref{sec:mondrian_overview_proofs} we provide a brief overview of -the proofs -of these first main results. We focus on the technical innovations and general -strategic approach, giving some insight into the challenges involved, and refer -the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs. 
- -In Section~\ref{sec:mondrian_debiased} we define debiased Mondrian random -forests, a -collection of estimators based on linear combinations of Mondrian random -forests with varying lifetime parameters. These parameters are carefully chosen -to annihilate leading terms in our bias characterization, yielding an estimator -with provably superior bias properties -(Theorem~\ref{thm:mondrian_bias_debiased}). In -Theorem~\ref{thm:mondrian_clt_debiased} -we verify that a central limit theorem continues to hold for the debiased -Mondrian random forest. We again state the limiting variance, discuss the -implications for the lifetime parameter, and provide a consistent variance -estimator (Theorem~\ref{thm:mondrian_variance_estimation_debiased}) for -constructing -confidence intervals (Theorem~\ref{thm:mondrian_confidence_debiased}). As a -final -corollary of the improved bias properties, we demonstrate in -Theorem~\ref{thm:mondrian_minimax} that the debiased Mondrian random forest -estimator is minimax-optimal in pointwise mean squared error for all -$\beta > 0$, provided that $\beta$ is known a priori. - -Section~\ref{sec:mondrian_parameter_selection} discusses tuning parameter -selection, -beginning with a data-driven approach to selecting the crucial lifetime -parameter using polynomial estimation, alongside other practical suggestions -including generalized cross-validation. -We also give advice on choosing the number of trees, and other parameters -associated with the debiasing procedure. - -In Section~\ref{sec:mondrian_weather} we present an illustrative example -application of our proposed methodology for estimation and inference in the -setting of weather forecasting in Australia. We demonstrate the use of -our debiased Mondrian random forest estimator and our -generalized cross-validation procedure for lifetime parameter selection, -as well as the construction of point estimates and confidence intervals. - -Concluding remarks are given in Section~\ref{sec:mondrian_conclusion}, while -Appendix~\ref{app:mondrian} contains all the mathematical proofs of our -theoretical contributions, along with some other technical -probabilistic results on the Mondrian process which may be of interest. - -\subsection{Notation} - -We write $\|\cdot\|_2$ for the usual Euclidean $\ell^2$-norm on $\R^d$. The -natural numbers are $\N = \{0, 1, 2, \ldots \}$. We use $a \wedge b$ for the -minimum and $a \vee b$ for the maximum of two real numbers. For a set $A$, we -use $A^{\comp}$ for the complement whenever the background space is clear from -context. We use $C$ to denote a positive constant whose value may change from -line to line. For non-negative sequences $a_n$ and $b_n$, write -$a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded -for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If -$a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. For random -non-negative sequences $A_n$ and $B_n$, similarly write $A_n \lesssim_\P B_n$ -or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability, -and $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. Convergence of -random variables $X_n$ in distribution to a law $\P$ is denoted by -$X_n \rightsquigarrow \P$. - -\section{Setup} -\label{sec:mondrian_setup} - -When using a Mondrian random forest, there are two sources of randomness. The -first is of course the data, and here we consider the nonparametric regression -setting with $d$-dimensional covariates. 
The second source is a collection of -independent trees drawn from a Mondrian process, which we define in the -subsequent section, using a specified lifetime parameter. - -\subsection{The Mondrian process} -\label{sec:mondrian_process} - -The Mondrian process was introduced by \citet{roy2008mondrian} and offers a -canonical method for generating random rectangular partitions, which can be -used as the trees for a random forest -\citep{lakshminarayanan2014mondrian,lakshminarayanan2016mondrian}. For -the reader's convenience, we give a brief description of this process here; see -\citet[Section~3]{mourtada2020minimax} for a more complete definition. - -For a fixed dimension $d$ and lifetime parameter $\lambda > 0$, the Mondrian -process is a stochastic process taking values in the set of finite rectangular -partitions of $[0,1]^d$. For a rectangle -$D = \prod_{j=1}^d [a_j, b_j] \subseteq [0,1]^d$, -we denote the side aligned with dimension $j$ by $D_j = [a_j, b_j]$, write -$D_j^- = a_j$ and $D_j^+ = b_j$ for its left and right endpoints respectively, -and use $|D_j| = D_j^+ - D_j^-$ for its length. The volume of $D$ is -$|D| = \prod_{j=1}^{d} |D_j|$ and its linear dimension (or half-perimeter) is -$|D|_1 = \sum_{j=1}^{d} |D_j|$. - -To sample a partition $T$ from the Mondrian process -$\cM \big( [0,1]^d, \lambda \big)$ we start at time $t=0$ with the trivial -partition of $[0,1]^d$ which has no splits. We then repeatedly apply the -following procedure to each cell $D$ in the partition. Let $t_D$ be the time at -which the cell was formed, and sample $E_D \sim \Exp \left( |D|_1 \right)$. If -$t_D + E_D \leq \lambda$, then we split $D$. This is done by first selecting a -split dimension $J$ with $\P(J=j) = |D_j| / |D|_1$, and then sampling a split -location $S_J \sim \Unif\big[D_J^-, D_J^+\big]$. The cell $D$ splits into the -two new cells $\{x \in D : x_J \leq S_J\}$ and $\{x \in D : x_J > S_J\}$, each -with formation time $t_D + E_D$. The final outcome is the partition $T$ -consisting of the cells $D$ which were not split because $t_D + E_D > \lambda$. -The cell in $T$ containing a point $x \in [0,1]^d$ is written $T(x)$. -Figure~\ref{fig:mondrian_process} shows typical realizations of -$T \sim \cM\big( [0,1]^d, \lambda \big)$ for $d=2$ and with different lifetime -parameters $\lambda$. -% -\begin{figure}[t] - \centering - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/plot_mondrian_process_1.pdf} - \caption{$\lambda = 3$} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/plot_mondrian_process_2.pdf} - \caption{$\lambda = 10$} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/plot_mondrian_process_3.pdf} - \caption{$\lambda = 30$} - \end{subfigure} - % - \caption[The Mondrian process]{ - The Mondrian process $T \sim \cM \big( [0,1]^d, \lambda \big)$ with - $d=2$ and lifetime parameters $\lambda$.} - \label{fig:mondrian_process} -\end{figure} - -\subsection{Data generation} - -Throughout this chapter, we assume that the data satisfies -Assumption~\ref{ass:mondrian_data}. We begin with a definition of H{\"o}lder -continuity which will be used for controlling the bias of various estimators. - -\begin{definition}[H{\"o}lder continuity]% - - Take $\beta > 0$ and define $\flbeta$ to be the largest integer which is - strictly less than $\beta$. 
We say a function $g: [0,1]^d \to \R$ is - $\beta$-H{\"o}lder continuous and write $g \in \cH^\beta$ if $g$ is $\flbeta$ - times differentiable and - $\max_{|\nu| = \flbeta} - \left| \partial^\nu g(x) - \partial^{\nu} g(x') \right| - \leq C \|x-x'\|_2^{\beta - \flbeta}$ - for some constant $C > 0$ and all $x, x' \in [0,1]^d$. Here, $\nu \in \N^d$ - is a multi-index with $|\nu| = \sum_{j=1}^d \nu_j$ and - $\partial^{\nu} g(x) = \partial^{|\nu|} g(x) \big/ - \prod_{j=1}^d \partial x_j^{\nu_j}$. We say $g$ is Lipschitz if $g \in \cH^1$. - -\end{definition} - -\begin{assumption}[Data generation]% - \label{ass:mondrian_data} - - Fix $d \geq 1$ and let $(X_i, Y_i)$ be i.i.d.\ samples from a distribution on - $\R^d \times \R$, writing $\bX = (X_1, \ldots, X_n)$ and - $\bY = (Y_1, \ldots, Y_n)$. Suppose $X_i$ has a Lebesgue density function - $f(x)$ on $[0,1]^d$ which is bounded away from zero and satisfies - $f \in \cH^\beta$ for some $\beta \geq 1$. Suppose $\E[Y_i^2 \mid X_i]$ is - bounded, let $\mu(X_i) = \E[Y_i \mid X_i]$, and assume $\mu \in \cH^\beta$. - Write $\varepsilon_i = Y_i - \mu(X_i)$ and assume - $\sigma^2(X_i) = \E[\varepsilon_i^2 \mid X_i]$ - is Lipschitz and bounded away from zero. - -\end{assumption} - -Some comments are in order surrounding Assumption~\ref{ass:mondrian_data}. The -requirement that the covariate density $f(x)$ be strictly positive on all of -$[0,1]^d$ may seem strong, particularly when $d$ is moderately large. However, -since our theory is presented pointwise in $x$, it is sufficient for this to -hold only on some neighborhood of $x$. To see this, note that continuity -implies the density is positive on some hypercube containing $x$. Upon -rescaling the covariates, we can map this hypercube onto $[0,1]^d$. The same -argument of course holds for the H{\"o}lder smoothness assumptions and the -upper and lower bounds on the conditional variance function. - -\subsection{Mondrian random forests} -\label{sec:mondrian_forests} - -We define the basic Mondrian random forest estimator -\eqref{eq:mondrian_estimator} as in \citet{lakshminarayanan2014mondrian} and -\citet{mourtada2020minimax}, and will later extend it to a debiased version in -Section~\ref{sec:mondrian_debiased}. For a lifetime parameter $\lambda > 0$ and -forest -size $B \geq 1$, let $\bT = (T_1, \ldots, T_B)$ be a Mondrian forest where -$T_b \sim \cM\big([0,1]^d, \lambda\big)$ are i.i.d.\ Mondrian trees -which are independent of the data. For $x \in [0,1]^d$, write -$N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ for the number of -samples in $T_b(x)$, with $\I$ denoting an indicator function. Then the -Mondrian random forest estimator of $\mu(x)$ is -% -\begin{equation} - \label{eq:mondrian_estimator} - \hat\mu(x) = \frac{1}{B} \sum_{b=1}^B - \frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. -\end{equation} -% -If there are no samples $X_i$ in $T_b(x)$ then $N_b(x) = 0$, so we define -$0/0 = 0$ (see Section~\ref{sec:mondrian_app_proofs} for details). To ensure the -bias and variance of the Mondrian random forest estimator converge to zero (see -Section~\ref{sec:mondrian_inference}), and to avoid boundary issues, we impose -some basic conditions on $x$, $\lambda$, and $B$ in -Assumption~\ref{ass:mondrian_estimator}. 
- -\begin{assumption}[Mondrian random forest estimator]% - \label{ass:mondrian_estimator} - % - Suppose $x \in (0,1)^d$ is an interior point of the support of $X_i$, - $\frac{\lambda^d}{n} \to 0$, - $\log \lambda \asymp \log n$, - and $B \asymp n^{\xi}$ for some $\xi \in (0, 1)$, - which may depend on the dimension $d$ and smoothness $\beta$. - % -\end{assumption} - -Assumption~\ref{ass:mondrian_estimator} implies that the size of the forest $B$ -grows -with $n$. For the purpose of mitigating the computational burden, we suggest -the sub-linear polynomial growth $B \asymp n^{\xi}$, satisfying the conditions -imposed in our main results. Large forests usually do not present computational -challenges in practice as the ensemble estimator is easily parallelizable over -the trees. We emphasize places where this ``large forest'' condition is -important to our theory as they arise throughout the chapter. - -\section{Inference with Mondrian random forests}% -\label{sec:mondrian_inference} - -We begin with a bias--variance decomposition for the Mondrian random -forest estimator: -% -\begin{align} - \nonumber - \hat\mu(x) - \mu(x) - &= - \Big( \hat\mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big]\Big) - + \Big( \E \big[ \hat \mu(x) \mid \bX, \bT \big] - \mu(x)\Big) \\ - &= - \nonumber - \frac{1}{B} \sum_{b=1}^B - \frac{\sum_{i=1}^n \varepsilon_i \, \I\big\{ X_i \in T_b(x) \big\}} - {N_b(x)} \\ - \label{eq:mondrian_bias_variance} - &\quad+ - \frac{1}{B} \sum_{b=1}^B - \frac{\sum_{i=1}^n \big(\mu(X_i) - \mu(x)\big) \, - \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. -\end{align} -% -Our approach to inference is summarized as follows. Firstly, we provide a -central limit theorem (weak convergence to a Gaussian) for the first -``variance'' term in \eqref{eq:mondrian_bias_variance}. Secondly, we precisely -compute -the probability limit of the second ``bias'' term. By ensuring that the -standard deviation dominates the bias, a corresponding -central limit theorem holds for the Mondrian random forest. With an appropriate -estimator for the limiting variance, we establish procedures for valid and -feasible statistical inference on the unknown regression function $\mu(x)$. - -We begin with the aforementioned central limit theorem, which forms the core of -our methodology for performing statistical inference. Before stating our main -result, we highlight some of the challenges involved. At first glance, the -summands in the first term in \eqref{eq:mondrian_bias_variance} seem to be -independent -over $1 \leq i \leq n$, conditional on the forest $\bT$, depending only on -$X_i$ and $\varepsilon_i$. However, the $N_b(x)$ appearing in the denominator -depends on all $X_i$ simultaneously, violating this independence assumption and -rendering classical central limit theorems inapplicable. A natural preliminary -attempt to resolve this issue is to observe that -% -\begin{equation*} - N_b(x)= \sum_{i=1}^{n} \I\big\{X_i \in T_b(x)\big\} - \approx n \, \P \big( X_i \in T_b(x) \mid T_b \big) - \approx n f(x) |T_b(x)| -\end{equation*} -% -with high probability. One could attempt to use this by approximating the -estimator with an average of i.i.d.\ random variables, or by employing a -central limit theorem conditional on $\bX$ and $\bT$. However, such an approach -fails because $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$; the possible -existence of small cells causes the law of the inverse cell volume to have -heavy tails. 
For similar reasons, attempts to directly establish a central
limit theorem based on $2 + \delta$ moments, such as the Lyapunov central limit
theorem, are ineffective.

We circumvent these problems by directly analyzing
$\frac{\I\{N_b(x) \geq 1\}}{N_b(x)}$. We establish concentration properties for
this non-linear function of $X_i$ via the Efron--Stein inequality
\citep[Section 3.1]{boucheron2013concentration} along with a sequence of
somewhat delicate preliminary lemmas regarding inverse moments of truncated
(conditional) binomial random variables. In particular, we show that
$\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)} \right]
\lesssim \frac{\lambda^d}{n}$ and
$\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)^2} \right]
\lesssim \frac{\lambda^{2d} \log n}{n^2}$.
Asymptotic normality is then established using a central limit theorem for
martingale difference sequences \citep[Theorem~3.2]{hall1980martingale} with
respect to an appropriate filtration.
Section~\ref{sec:mondrian_overview_proofs} gives
an overview of our proof strategy, in which we further discuss the underlying
challenges, while Section~\ref{sec:mondrian_app_proofs} gives all the technical
details.

\subsection{Central limit theorem}
\label{sec:mondrian_clt}

Theorem~\ref{thm:mondrian_clt} gives our first main result.

\begin{theorem}[Central limit theorem for the centered
    Mondrian random forest estimator]%
    \label{thm:mondrian_clt}
    %
    Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}
    hold, and further assume that
    $\E[Y_i^4 \mid X_i ]$ is bounded almost surely
    and $\frac{\lambda^d \log n}{n} \to 0$. Then
    %
    \begin{align*}
        \sqrt{\frac{n}{\lambda^d}}
        \Big( \hat \mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big] \Big)
        &\rightsquigarrow \cN\big(0, \Sigma(x)\big)
        & &\text{where}
        &\Sigma(x) &=
        \frac{\sigma^2(x)}{f(x)} \left( \frac{4 - 4 \log 2}{3} \right)^d.
    \end{align*}
\end{theorem}

The condition that $B \to \infty$ is crucial, ensuring sufficient ``mixing'' of
different Mondrian cells to escape the heavy-tailed phenomenon detailed in the
preceding discussion. For concreteness, the large forest condition allows us to
deal with expressions such as
$\E \left[ \frac{1}{|T_b(x)| |T_{b'}(x)|} \right]
= \E \left[ \frac{1}{|T_b(x)|} \right] \E \left[ \frac{1}{|T_{b'}(x)|} \right]
\approx \lambda^{2d} < \infty$
where $b \neq b'$, by independence of the trees, rather than the ``no
ensembling'' single tree analog
$\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$.

We take this opportunity to contrast Mondrian random forests with more
classical kernel-based smoothing methods. The lifetime $\lambda$ plays a
similar role to the inverse bandwidth in determining the effective sample size
$n / \lambda^d$, and thus the associated rate of convergence. However, due to
the Mondrian process construction, some cells are typically ``too small''
(equivalent to an insufficiently large bandwidth) to give an appropriate
effective sample size. Similarly, classical methods based on non-random
partitioning such as spline estimators \citep{huang2003local,cattaneo2020large}
typically impose a quasi-uniformity assumption to ensure all the cells are of
comparable size, a property which does not hold for the Mondrian process (not
even with probability approaching one).
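The heavy-tailed phenomenon above is easy to visualize numerically using the
distributional characterization of interior cells given in
Section~\ref{sec:mondrian_overview_proofs}, namely that each side of $T_b(x)$
is approximately $(E_{j 1} + E_{j 2}) / \lambda$ with $E_{j 1}, E_{j 2}$
independent $\Exp(1)$. The following Monte Carlo sketch (illustrative only)
contrasts the stable first moment and cross-tree product with the divergent
single-tree second moment of the inverse cell volume.
%
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d, reps, lam = 2, 10**6, 1.0

def inv_cell_volume():
    # interior cell sides are approximately (E_j1 + E_j2) / lambda
    sides = rng.exponential(size=(reps, d, 2)).sum(axis=2) / lam
    return 1.0 / sides.prod(axis=1)

inv_b, inv_bp = inv_cell_volume(), inv_cell_volume()  # two independent trees
print(inv_b.mean())             # first moment: stable, about lambda^d
print((inv_b ** 2).mean())      # second moment: infinite, unstable empirically
print((inv_b * inv_bp).mean())  # cross-tree product: stable, about lambda^(2d)
\end{verbatim}
%
The single-tree second moment keeps growing as the number of replications
increases, while the other two quantities stabilize, mirroring the discussion
above.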
- -\subsection*{Bias characterization} - -We turn to the second term in \eqref{eq:mondrian_bias_variance}, which captures -the bias -of the Mondrian random forest estimator conditional on the covariates $\bX$ and -the forest $\bT$. As such, it is a random quantity which, as we will -demonstrate, converges in probability. We precisely characterize the limiting -non-random bias, including high-degree polynomials in $\lambda$ which for now -may seem ignorable. Indeed the magnitude of the bias is determined by its -leading term, typically of order $1/\lambda^2$ whenever $\beta \geq 2$, and -this suffices for ensuring a negligible contribution from the bias with an -appropriate choice of lifetime parameter. However, the advantage of specifying -higher-order bias terms is made apparent in Section~\ref{sec:mondrian_debiased} -when we -construct a debiased Mondrian random forest estimator. There, we target and -annihilate the higher-order terms in order to furnish superior estimation and -inference properties. -Theorem~\ref{thm:mondrian_bias} gives our main result on -the bias of the Mondrian random forest estimator. - -\begin{theorem}[Bias of the Mondrian random forest estimator]% - \label{thm:mondrian_bias} - % - Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} - hold. - Then for each $1 \leq r \leq \lfloor \flbeta / 2 \rfloor$ there exists - $B_r(x) \in \R$, which is a function only of - the derivatives of $f$ and $\mu$ at $x$ up to order $2r$, with - % - \begin{equation*} - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - = \mu(x) - + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} - \frac{B_r(x)}{\lambda^{2r}} - + O_\P \left( - \frac{1}{\lambda^\beta} - + \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} - \right). - \end{equation*} - % - Whenever $\beta > 2$ the leading bias is the quadratic term - % - \begin{equation*} - \frac{B_1(x)}{\lambda^2} - = - \frac{1}{2 \lambda^2} - \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} - + \frac{1}{2 \lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j}. - \end{equation*} - % - If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$, - and using multi-index notation we have - % - \begin{equation*} - \frac{B_r(x)}{\lambda^{2r}} - = \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) - \prod_{j=1}^d \frac{1}{\nu_j + 1}. - \end{equation*} - % -\end{theorem} - -In Theorem~\ref{thm:mondrian_bias} we give some explicit examples of -calculating the -limiting bias if $\beta > 2$ or when $X_i$ are uniformly distributed. The -general form of $B_r(x)$ is provided in Section~\ref{sec:mondrian_app_proofs} -but -is somewhat unwieldy except in specific situations. Nonetheless the most -important properties are that $B_r(x)$ are non-random and do not depend on the -lifetime $\lambda$, crucial facts for our debiasing procedure given in -Section~\ref{sec:mondrian_debiased}. If the forest size $B$ does not diverge to -infinity -then we suffer the first-order bias term $\frac{1}{\lambda \sqrt B}$. This -phenomenon was explained by \citet{mourtada2020minimax}, who noted that it -allows single Mondrian trees to achieve minimax optimality only when -$\beta \in (0, 1]$. Large forests remove this first-order bias -and are optimal for all $\beta \in (0, 2]$. 
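To illustrate the form of these coefficients, consider $d = 1$ with uniform
covariates: the only multi-index satisfying $|\nu| = r$ is $\nu = r$, so the
expression above reduces to
%
\begin{align*}
    \frac{B_r(x)}{\lambda^{2r}}
    &= \frac{\mu^{(2r)}(x)}{(r + 1) \, \lambda^{2r}},
\end{align*}
%
and in particular $B_1(x) = \mu''(x) / 2$, in agreement with the quadratic term
displayed above when $f$ is constant.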
- -Using Theorem~\ref{thm:mondrian_clt} and Theorem~\ref{thm:mondrian_bias} -together, -along with an appropriate choice of lifetime parameter $\lambda$, -gives a central limit theorem for the Mondrian random forest estimator -which can be used, for example, to build confidence intervals -for the unknown regression function $\mu(x)$ -whenever the bias shrinks faster than the standard deviation. -In general this will require -$\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} -\ll \sqrt{\frac{\lambda^d}{n}}$, -which can be satisfied by imposing the restrictions -$\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ -and $B \gg n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$ -on the lifetime $\lambda$ and forest size $B$. -If instead we aim for optimal point estimation, -then balancing the bias and standard deviation requires -$\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} -\asymp \sqrt{\frac{\lambda^d}{n}}$, -which can be satisfied by -$\lambda \asymp n^{\frac{1}{d + 2(2 \wedge \beta)}}$ -and $B \gtrsim n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$. -Such a choice of $\lambda$ gives the convergence rate -$n^{\frac{-(2 \wedge \beta)}{d + 2(2 \wedge \beta)}}$ -which is the minimax-optimal rate of convergence \citep{stone1982optimal} -for $\beta$-H{\"o}lder functions with $\beta \in (0,2]$ -as shown by \citet[Theorem~2]{mourtada2020minimax}. -In Section~\ref{sec:mondrian_debiased} we will show how the Mondrian random -forest -estimator can be debiased, giving both weaker lifetime conditions for inference -and also improved rates of convergence, under additional smoothness assumptions. - -\subsection*{Variance estimation} - -The limiting variance $\Sigma(x)$ from the resulting central limit theorem -depends on the unknown quantities $\sigma^2(x)$ and $f(x)$. -To conduct feasible inference, we must therefore first estimate -$\Sigma(x)$. To this end, define -% -\begin{align} - \label{eq:mondrian_sigma2_hat} - \hat\sigma^2(x) - &= - \frac{1}{B} \sum_{b=1}^{B} \sum_{i=1}^n - \frac{\big(Y_i - \hat \mu(x)\big)^2 \, \I\{X_i \in T_b(x)\}} {N_b(x)}, \\ - \nonumber - \hat\Sigma(x) - &= - \hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n - \left( \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_b(x)\}}{N_b(x)} \right)^2. -\end{align} -% -In Theorem~\ref{thm:mondrian_variance_estimation} we show that this -estimator is consistent, and establish its rate of convergence. -% -\begin{theorem}[Variance estimation]% - \label{thm:mondrian_variance_estimation} - Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, - and - suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then - % - \begin{align*} - \hat\Sigma(x) - = \Sigma(x) - + O_\P \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} - \right). - \end{align*} - -\end{theorem} - -\subsection{Confidence intervals} - -Theorem~\ref{thm:mondrian_confidence} shows how to construct valid confidence -intervals -for the regression function $\mu(x)$ under the lifetime and forest size -assumptions previously discussed. For details on feasible and practical -selection of the lifetime parameter $\lambda$, see -Section~\ref{sec:mondrian_parameter_selection}. 
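In practice the resulting interval takes the familiar Wald form
$\hat\mu(x) \pm \sqrt{\lambda^d / n} \, \hat\Sigma(x)^{1/2} q_{1 - \alpha/2}$.
A minimal sketch of its computation is given below (Python, with illustrative
names only; the indicator matrix recording which samples fall in each cell
$T_b(x)$ is assumed to be available from the fitted forest), and
Theorem~\ref{thm:mondrian_confidence} establishes the asymptotic validity of
this construction.
%
\begin{verbatim}
import numpy as np
from scipy.stats import norm

def mondrian_ci(memb, Y, lam, d, alpha=0.05):
    # memb: (B, n) boolean array, memb[b, i] indicates X_i in T_b(x)
    n = len(Y)
    N = np.maximum(memb.sum(axis=1), 1)      # N_b(x), avoiding 0/0
    mu_hat = np.mean(memb @ Y / N)           # forest estimate of mu(x)
    sigma2_hat = np.mean(memb @ (Y - mu_hat) ** 2 / N)
    w = (memb / N[:, None]).mean(axis=0)     # per-sample average weights
    Sigma_hat = sigma2_hat * n / lam**d * np.sum(w ** 2)
    half = np.sqrt(lam**d / n * Sigma_hat) * norm.ppf(1 - alpha / 2)
    return mu_hat - half, mu_hat + half
\end{verbatim}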
-% -\begin{theorem}[Feasible confidence intervals using a Mondrian random forest]% - \label{thm:mondrian_confidence} - % - Suppose that Assumptions~\ref{ass:mondrian_data} and - \ref{ass:mondrian_estimator} hold, - $\E[Y_i^4 \mid X_i ]$ is bounded almost surely, - and $\frac{\lambda^d \log n}{n} \to 0$. Assume that - $\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ - and $B \gg n^{\frac{2 (2 \wedge \beta) - 2}{d + 2 (2 \wedge \beta)}}$. - For a confidence level $\alpha \in (0, 1)$, - let $q_{1 - \alpha / 2}$ be the normal quantile satisfying - $\P \left( \cN(0, 1) \leq q_{1 - \alpha / 2} \right) = 1 - \alpha / 2$. Then - % - \begin{align*} - \P \left( - \mu(x) \in - \left[ - \hat \mu(x) - - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} - q_{1 - \alpha / 2}, \ - \hat \mu(x) - + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} - q_{1 - \alpha / 2} - \right] - \right) - \to - 1 - \alpha. - \end{align*} - -\end{theorem} - -When coupled with an appropriate lifetime selection method, -Theorem~\ref{thm:mondrian_confidence} gives a fully feasible procedure for -uncertainty -quantification in Mondrian random forests. Our procedure requires no adjustment -of the original Mondrian random forest estimator beyond ensuring that the bias -is negligible, and in particular does not rely on sample splitting. The -construction of confidence intervals is just one corollary of the weak -convergence result given in Theorem~\ref{thm:mondrian_clt}, and follows -immediately from Slutsky's theorem -\citep[Chapter~7]{pollard2002user} -with a consistent variance estimator. Other applications -include hypothesis testing on the value of $\mu(x)$ at a design point $x$ by -inversion of the confidence interval, as well as parametric specification -testing by comparison with a $\sqrt{n}$-consistent parametric regression -estimator. The construction of simultaneous confidence intervals for finitely -many points $x_1, \ldots, x_D$ can be accomplished either using standard -multiple testing corrections or by first establishing a multivariate central -limit theorem using the Cram{\'e}r--Wold device -\citep[Chapter~8]{pollard2002user} -and formulating a consistent multivariate variance estimator. - -\section{Overview of proof strategy}% -\label{sec:mondrian_overview_proofs} - -This section provides some insight into the general approach we use to -establish the main results in the preceding sections. We focus on the technical -innovations forming the core of our arguments, and refer the reader to -Section~\ref{sec:mondrian_app_proofs} for detailed proofs, including those for -the -debiased estimator discussed in the upcoming -Section~\ref{sec:mondrian_debiased}. - -\subsection*{Preliminary results} - -The starting point for our proofs is a characterization of the exact -distribution of the shape of a Mondrian cell $T(x)$. This property is a direct -consequence of the fact that the restriction of a Mondrian process to a subcell -remains Mondrian \citep[Fact~2]{mourtada2020minimax}. We have -% -\begin{align*} - |T(x)_j| - &= \left( \frac{E_{j1}}{\lambda} \wedge x_j \right) - + \left( \frac{E_{j2}}{\lambda} \wedge (1-x_j) \right) -\end{align*} -% -for all $1 \leq j \leq d$, recalling that $T(x)_j$ is the side of the cell -$T(x)$ aligned with axis $j$, and where $E_{j1}$ and $E_{j2}$ are mutually -independent $\Exp(1)$ random variables. 
Our assumptions that $x \in (0,1)^d$ and
$\lambda \to \infty$ make the boundary terms $x_j$ and $1-x_j$
eventually ignorable so
%
\begin{align*}
    |T(x)_j| &= \frac{E_{j1} + E_{j2}}{\lambda}
\end{align*}
%
with high probability. Controlling the size of the largest cell in the forest
containing $x$ is now straightforward with a union bound, exploiting the sharp
tail decay of the exponential distribution, and thus
%
\begin{align*}
    \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j|
    \lesssim_\P \frac{\log B}{\lambda}.
\end{align*}
%
This shows that, up to logarithmic terms, none of the cells in the forest at $x$
are significantly larger than average, ensuring that the Mondrian random forest
estimator is localized around $x$ on the scale of $1/\lambda$, an important
property for the upcoming bias characterization.

Having provided upper bounds for the sizes of Mondrian cells, we must also
establish some lower bounds in order to quantify the ``small cell'' phenomenon
mentioned previously. The first step towards this is to bound the first two
moments of the truncated inverse Mondrian cell volume; we show that
%
\begin{align*}
    \E\left[ 1 \wedge \frac{1}{n |T(x)|} \right]
    &\asymp \frac{\lambda^d}{n}
    &&\text{and}
    &\frac{\lambda^{2d}}{n^2}
    &\lesssim
    \E\left[ 1 \wedge \frac{1}{n^2 |T(x)|^2} \right]
    \lesssim \frac{\lambda^{2d} \log n}{n^2}.
\end{align*}
%
These bounds are computed directly using the exact distribution of $|T(x)|$.
Note that $\E\left[ \frac{1}{|T(x)|^2} \right] = \infty$ because
$\frac{1}{E_{j1} + E_{j2}}$ has only $2 - \delta$ finite moments, so the
truncation is crucial here. Since we nearly have two moments, this
truncation is at the expense of only a logarithmic term. Nonetheless, third and
higher truncated moments will not enjoy such tight bounds, demonstrating both
the fragility of this result and the inadequacy of tools such as the Lyapunov
central limit theorem which require $2 + \delta$ moments.

To conclude this investigation into the small cell phenomenon, we apply the
previous bounds to ensure that the empirical effective sample sizes
$N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ are approximately
of the order $n / \lambda^d$ in an appropriate sense; we demonstrate that
%
\begin{align*}
    \E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \right]
    &\lesssim \frac{\lambda^d}{n}
    &&\text{and}
    &\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)^2} \right]
    &\lesssim \frac{\lambda^{2d} \log n}{n^2},
\end{align*}
%
as well as similar bounds for mixed terms such as
%
$\E \left[
    \frac{\I\{N_b(x) \geq 1\}}{N_b(x)}
    \frac{\I\{N_{b'}(x) \geq 1\}}{N_{b'}(x)}
\right]
\lesssim \frac{\lambda^{2d}}{n^2}$
%
when $b \neq b'$, which arise from covariance terms across multiple trees. The
proof of this result is involved and technical, and proceeds by induction. The
idea is to construct a class of subcells by taking all possible intersections
of the cells in $T_b$ and $T_{b'}$ (we show two trees here for clarity; there
may be more) and noting that each $N_b(x)$ is the sum of the number of points
in each such refined cell intersected with $T_b(x)$. We then swap out each
refined cell one at a time and replace the number of data points it contains
with its volume multiplied by $n f(x)$, showing that the expectation on the
left hand side does not increase too much using a moment bound for inverse
binomial random variables based on Bernstein's inequality.
By induction and
independence of the trees, eventually the problem is reduced to computing
moments of truncated inverse Mondrian cell volumes, as above.

\subsection*{Central limit theorem}

To prove our main central limit theorem result
(Theorem~\ref{thm:mondrian_clt}), we use
the martingale central limit theorem given by
\citet[Theorem~3.2]{hall1980martingale}. For each $1 \leq i \leq n$ define
$\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and
$(\varepsilon_j : 1 \leq j \leq i)$, noting that
$\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases as $n$ increases.
Define the $\cH_{n i}$-measurable and square integrable variables
%
\begin{align*}
    S_i(x) &=
    \sqrt{\frac{n}{\lambda^d}} \frac{1}{B} \sum_{b=1}^B
    \frac{\I \{X_i \in T_b(x)\} \varepsilon_i} {N_{b}(x)},
\end{align*}
%
which satisfy the martingale difference property
$\E [ S_i(x) \mid \cH_{n (i-1)} ] = 0$,
where $\cH_{n 0}$ is generated by $\bT$ and $\bX$ alone. Further,
%
\begin{align*}
    \sqrt{\frac{n}{\lambda^d}}
    \big(
    \hat\mu(x)
    - \E\left[
    \hat\mu(x) \mid \bX, \bT
    \right]
    \big)
    = \sum_{i=1}^n S_i(x).
\end{align*}
%
To establish weak convergence to $\cN\big(0, \Sigma(x)\big)$,
it suffices to check that $\max_i |S_i(x)| \to 0$ in probability,
$\E\left[\max_i S_i(x)^2\right] \lesssim 1$,
and $\sum_i S_i(x)^2 \to \Sigma(x)$ in probability.
Checking the first two of these is straightforward given the denominator moment
bounds derived above. For the third condition, we demonstrate that
$\sum_i S_i(x)^2$ concentrates by checking its variance is vanishing. To do
this, first observe that $S_i(x)^2$ is the square of a sum over the $B$ trees.
Expanding this square, we see that the diagonal terms (where $b = b'$) provide
a negligible contribution due to the large forest assumption. For the other
terms, we apply the law of total variance and the moment bounds detailed
earlier. Here, it is crucial that $b \neq b'$ in order to exploit the
independence of the trees and avoid having to control any higher moments. The
law of total variance requires that we bound
%
\begin{align*}
    \Var \left[
    \E \left[
    \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b}
    \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2}
    {N_{b}(x) N_{b'}(x)} \Bigm| \bX, \bY
    \right]
    \right],
\end{align*}
%
which is the variance of a non-linear function of the i.i.d.\ variables
$(X_i, \varepsilon_i)$, and so we apply the Efron--Stein inequality.
The important insight here is that replacing a sample
$(X_i, \varepsilon_i)$ with an independent copy
$(\tilde X_i, \tilde \varepsilon_i)$ can change the value of
$N_b(x)$ by at most one. Further, this can happen only on the event
$\{ X_i \in T_{b}(x) \} \cup \{ \tilde X_i \in T_{b}(x) \}$,
which occurs with probability on the order $1/\lambda^d$
(the expected cell volume).

The final part of the central limit theorem proof is to calculate the limiting
variance $\Sigma(x)$. The penultimate step showed that we must have
%
\begin{align*}
    \Sigma(x)
    &= \lim_{n \to \infty} \sum_{i=1}^n \E \left[S_i(x)^2 \right]
    = \lim_{n \to \infty}
    \frac{n^2}{\lambda^d} \,
    \E \left[
    \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2}
    {N_{b}(x) N_{b'}(x)}
    \right],
\end{align*}
%
assuming the limit exists, so it remains to check this and calculate the limit.
It is a straightforward but tedious exercise to verify that each term can be
replaced with its conditional expectation given $T_b$ and $T_{b'}$, using some
further properties of the binomial and exponential distributions.
This yields -% -\begin{align*} - \Sigma(x) - &= - \frac{\sigma^2(x)}{f(x)} - \lim_{\lambda \to \infty} - \frac{1}{\lambda^d} - \E \left[ - \frac{|T_{b}(x) \cap T_{b'}(x)|} - {|T_{b}(x)| \, |T_{b'}(x)|} - \right] - = \frac{\sigma^2(x)}{f(x)} - \E \left[ - \frac{(E_{1} \wedge E'_{1}) + (E_{2} \wedge E'_{2})} - {(E_{1} + E_{2}) (E'_{1} + E'_{2})} - \right]^d -\end{align*} -% -where $E_1$, $E_2$, $E'_1$, and $E'_2$ are independent $\Exp(1)$, -by the cell shape distribution and independence of the trees. This final -expectation is calculated by integration, using various incomplete gamma -function identities. - -\subsection*{Bias characterization} - -Our second substantial technical result is the bias characterization -given as Theorem~\ref{thm:mondrian_bias}, in which we precisely -characterize the probability limit of the conditional bias -% -\begin{align*} - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \mu(x) - &= - \frac{1}{B} \sum_{b=1}^B - \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) - \frac{\I\{X_i \in T_b(x)\}}{N_b(x)}. -\end{align*} -% -The first step is to pass to the ``infinite forest'' -limit by taking an expectation conditional on $\bX$, or equivalently -marginalizing over $\bT$, applying the conditional Markov inequality -to see -% -\begin{align*} - \big| - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \E \left[ \hat \mu(x) \mid \bX \right] - \big| - &\lesssim_\P - \frac{1}{\lambda \sqrt B}. -\end{align*} -% -While this may seem a crude approximation, it is already known that fixed-size -Mondrian forests have suboptimal bias properties when compared to forests with -a diverging number of trees. In fact, the error $\frac{1}{\lambda \sqrt B}$ -exactly accounts for the first-order bias of individual Mondrian trees noted by -\citet{mourtada2020minimax}. - -Next we show that $\E \left[ \hat \mu(x) \mid \bX \right]$ converges in -probability to its expectation, again using the Efron--Stein theorem for this -non-linear function of the i.i.d.\ variables $X_i$. The Lipschitz property of -$\mu$ and the upper bound on the maximum cell size give -$|\mu(X_i) - \mu(x)| \lesssim \max_{1 \leq j \leq d} |T_b(x)_j| -\lesssim_\P \frac{\log B}{\lambda}$ -whenever $X_i \in T_b(x)$, -so we combine this with moment bounds for the denominator $N_b(x)$ to see -% -\begin{align*} - \left| - \E \left[ \hat \mu(x) \mid \bX \right] - - \E \left[ \hat \mu(x) \right] - \right| - \lesssim_\P - \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}}. -\end{align*} - -The next step is to approximate the resulting non-random bias -$\E \left[ \hat \mu(x) \right] - \mu(x)$ as a polynomial in $1/\lambda$. -To this end, we firstly apply a concentration-type result for the binomial -distribution to deduce that -% -\begin{align*} - \E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \Bigm| \bT \right] - \approx \frac{1}{n \int_{T_b(x)} f(s) \diff s} -\end{align*} -% -in an appropriate sense, and hence, -by conditioning on $\bT$ and $\bX$ without $X_i$, we write -% -\begin{align} - \label{eq:mondrian_bias_ratio} - \E \left[ \hat \mu(x) \right] - \mu(x) - &\approx - \E \left[ - \frac{\int_{T_b(x)} (\mu(s) - \mu(x)) f(s) \diff s} - {\int_{T_b(x)} f(s) \diff s} - \right]. -\end{align} -% -Next we apply the multivariate version of Taylor's theorem to the integrands in -both the numerator and the denominator in \eqref{eq:mondrian_bias_ratio}, and -then apply -the Maclaurin series of $\frac{1}{1+x}$ and the multinomial theorem to recover -a single polynomial in $1/\lambda$. 
The error term is on the order of
$1/\lambda^\beta$ and depends on the smoothness of $\mu$ and $f$, and the
polynomial coefficients are given by various expectations involving exponential
random variables. The final step is to verify using symmetry of Mondrian cells
that all the odd monomial coefficients are zero, and to calculate some explicit
examples of the form of the limiting bias.

\section{Debiased Mondrian random forests}%
\label{sec:mondrian_debiased}

In this section we give our next main contribution, proposing a variant of the
Mondrian random forest estimator which corrects for higher-order bias with an
approach based on generalized jackknifing \citep{schucany1977improvement}. This
estimator retains the basic form of a Mondrian random forest estimator in the
sense that it is a linear combination of Mondrian tree estimators, but in this
section we allow for non-identical linear coefficients, some of which may be
negative, and for differing lifetime parameters across the trees. Since the
basic Mondrian random forest estimator is a special case of this more general
debiased version, we will discuss only the latter throughout the rest of the
chapter.

We use the explicit form of the bias given in Theorem~\ref{thm:mondrian_bias} to
construct a debiased version of the Mondrian forest estimator. Let $J \geq 0$
be the bias correction order. In particular, with $J=0$ we retain the original
Mondrian forest estimator, with $J=1$ we remove second-order bias, and with
$J = \lfloor\flbeta / 2 \rfloor$ we remove bias terms up to and including order
$2 \lfloor\flbeta / 2 \rfloor$, giving the maximum possible bias reduction
achievable in the H{\"o}lder class $\cH^\beta$. In this case, only bias terms of
order $1/\lambda^\beta$ remain.

For $0 \leq r \leq J$ let $\hat \mu_r(x)$ be a Mondrian forest estimator
based on the trees $T_{r b} \sim \cM\big([0,1]^d, \lambda_r \big)$
for $1 \leq b \leq B$, where $\lambda_r = a_r \lambda$ for some $a_r > 0$
and $\lambda > 0$. Write $\bT$ to denote the collection of all the trees,
and suppose they are mutually independent. We find values of $a_r$ along with
coefficients $\omega_r$ in order to annihilate the leading $J$ bias terms of
the debiased Mondrian random forest estimator
%
\begin{align}
    \label{eq:mondrian_debiased}
    \hat \mu_\rd(x)
    &= \sum_{r=0}^J \omega_r \hat \mu_r(x)
    = \sum_{r=0}^{J} \omega_r
    \frac{1}{B} \sum_{b=1}^B
    \frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_{r b}(x) \big\}} {N_{r b}(x)}.
\end{align}
%
This ensemble estimator retains the ``forest'' structure of the original
estimators, but with varying lifetime parameters $\lambda_r$ and coefficients
$\omega_r$. Thus by Theorem~\ref{thm:mondrian_bias} we want to solve
%
\begin{align*}
    \sum_{r=0}^{J} \omega_r
    \left( \mu(x) + \sum_{s=1}^{J} \frac{B_{s}(x)}{a_r^{2s} \lambda^{2s}} \right)
    &= \mu(x)
\end{align*}
%
for all $\lambda$, or equivalently the system of linear equations
$\sum_{r=0}^{J} \omega_r = 1$
and $\sum_{r=0}^{J} \omega_r a_r^{-2s} = 0$ for each $1 \leq s \leq J$.
We solve these as follows. Define the $(J+1) \times (J+1)$ Vandermonde matrix
$A$ with entries $A_{s r} = a_{r-1}^{2-2s}$,
whose rows correspond to the equations above,
and let $\omega = (\omega_0, \ldots, \omega_J)^\T \in \R^{J+1}$
and $e_0 = (1, 0, \ldots, 0)^\T \in \R^{J+1}$.
Then a solution for the debiasing coefficients is given by
$\omega = A^{-1} e_0$ whenever $A$ is non-singular.
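A short numerical sketch of this construction (illustrative only) solves the
linear system directly for a given choice of scales $a_r$:
%
\begin{verbatim}
import numpy as np

J = 2
a = 1.05 ** np.arange(J + 1)          # one possible (geometric) choice of a_r
# row s of A imposes sum_r omega_r * a_r^(-2s) = 1{s = 0}
A = a[None, :] ** (-2.0 * np.arange(J + 1)[:, None])
omega = np.linalg.solve(A, np.eye(J + 1)[0])
print(omega, omega.sum())             # coefficients omega_r, summing to one
\end{verbatim}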
-In practice we can take $a_r$ to be a fixed geometric or arithmetic sequence -to ensure this is the case, appealing to the Vandermonde determinant formula: -$\det A = \prod_{0 \leq r < s \leq J} (a_r^{-2} - a_s^{-2}) -\neq 0$ whenever $a_r$ are distinct. For example, we could set -$a_r = (1 + \gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. -Because we assume $\beta$, and therefore the choice of $J$, do not -depend on $n$, there is no need to quantify -the invertibility of $A$ by, for example, bounding its eigenvalues -away from zero as a function of $J$. - -\subsection{Central limit theorem} - -In Theorem~\ref{thm:mondrian_clt_debiased}, we verify that a central -limit theorem holds for the debiased -random forest estimator $\hat\mu_\rd(x)$ and give its limiting variance. -The strategy and challenges associated with proving -Theorem~\ref{thm:mondrian_clt_debiased} are identical to those discussed earlier -surrounding Theorem~\ref{thm:mondrian_clt}. In fact in -Section~\ref{sec:mondrian_app_proofs} -we provide a direct proof only for Theorem~\ref{thm:mondrian_clt_debiased} -and deduce Theorem~\ref{thm:mondrian_clt} as a special case. - -\begin{theorem}[Central limit theorem for the - debiased Mondrian random forest estimator]% - \label{thm:mondrian_clt_debiased} - % - Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} - hold, - $\E[Y_i^4 \mid X_i ]$ is bounded, - and $\frac{\lambda^d \log n}{n} \to 0$. Then - % - \begin{align*} - \sqrt{\frac{n}{\lambda^d}} - \Big( - \hat \mu_\rd(x) - - \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] - \Big) - &\rightsquigarrow - \cN\big(0, \Sigma_\rd(x)\big) - \end{align*} - % - where, with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} - \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, - the limiting variance is - % - \begin{align*} - \Sigma_\rd(x) - &= - \frac{\sigma^2(x)}{f(x)} - \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} - \left( \ell_{r r'} + \ell_{r' r} \right)^d. - \end{align*} - % -\end{theorem} - -It is easy to verify that in the case of no debiasing we have -$J=0$ and $a_0 = \omega_0 = 1$, yielding -$\Sigma_\rd(x) = \Sigma(x)$, and recovering Theorem~\ref{thm:mondrian_clt}. - -\subsection*{Bias characterization} - -In Theorem~\ref{thm:mondrian_bias_debiased} we verify that this debiasing -procedure does indeed annihilate the desired bias terms, and its proof is a -consequence of Theorem~\ref{thm:mondrian_bias} and the construction of the -debiased Mondrian random forest estimator $\hat\mu_\rd(x)$. - -\begin{theorem}[Bias of the debiased Mondrian random forest estimator]% - \label{thm:mondrian_bias_debiased} - Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}. - In the notation of Theorem~\ref{thm:mondrian_bias} with - $\bar\omega = \sum_{r=0}^J \omega_r a_r^{-2J - 2}$, - % - \begin{align*} - \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] - &= \mu(x) + \I\{2J+2 < \beta \} - \frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} \\ - &\quad+ - O_\P \left( - \frac{1}{\lambda^{2J + 4}} - + \frac{1}{\lambda^\beta} - + \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} - \right). - \end{align*} - % -\end{theorem} - -Theorem~\ref{thm:mondrian_bias_debiased} has the following consequence: -the leading bias term is characterized in terms of -$B_{J+1}(x)$ whenever $J < \beta/2 - 1$, -or equivalently $J < \lfloor \flbeta/2 \rfloor$, -that is, the debiasing order -$J$ does not exhaust the H{\"o}lder smoothness $\beta$. 
-If this condition does not hold, then the estimator is -fully debiased, and the resulting leading bias -term is bounded above by $1/\lambda^\beta$ up to constants, -but its form is left unspecified. - -\subsection*{Variance estimation} - -As before, we propose a variance estimator in order to conduct feasible -inference and show that it is consistent. -With $\hat\sigma^2(x)$ as in \eqref{eq:mondrian_sigma2_hat} -in Section~\ref{sec:mondrian_inference}, define the estimator -% -\begin{align} - \label{eq:mondrian_debiased_variance_estimator} - \hat\Sigma_\rd(x) - &= - \hat\sigma^2(x) - \frac{n}{\lambda^d} - \sum_{i=1}^n - \left( - \sum_{r=0}^J - \omega_r - \frac{1}{B} - \sum_{b=1}^B - \frac{\I\{X_i \in T_{r b}(x)\}} - {N_{r b}(x)} - \right)^2. -\end{align} -% -\begin{theorem}[Variance estimation]% - \label{thm:mondrian_variance_estimation_debiased} - Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, - and - suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then - % - \begin{align*} - \hat\Sigma_\rd(x) - = \Sigma_\rd(x) - + O_\P \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{1}{\sqrt B} - + \sqrt{\frac{\lambda^d \log n}{n}} - \right). - \end{align*} - % -\end{theorem} - -\subsection{Confidence intervals} - -In analogy to Section~\ref{sec:mondrian_inference}, -we now demonstrate the construction of feasible valid confidence -intervals using the debiased Mondrian random forest estimator -in Theorem~\ref{thm:mondrian_confidence_debiased}. -Once again we must ensure that the bias -(now significantly reduced due to our debiasing procedure) -is negligible when compared to the standard deviation -(which is of the same order as before). -We assume for simplicity that the estimator has been fully -debiased by setting $J \geq \lfloor \flbeta / 2\rfloor$ -to yield a leading bias of order $1/\lambda^\beta$, -but intermediate ``partially debiased'' versions can easily -be provided, with leading bias terms of order -$1/\lambda^{\beta \wedge (2J+2)}$ in general. -We thus require -$\frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} -\ll \sqrt{\frac{\lambda^d}{n}}$, -which can be satisfied by imposing the restrictions -$\lambda \gg n^{\frac{1}{d + 2 \beta}}$ -and $B \gg n^{\frac{2\beta - 2}{d + 2\beta}}$ -on the lifetime parameter $\lambda$ -and forest size $B$. - -\begin{theorem}[Feasible confidence intervals using a - debiased Mondrian random forest]% - \label{thm:mondrian_confidence_debiased} - % - Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} - hold, - $\E[Y_i^4 \mid X_i ]$ is bounded, - and $\frac{\lambda^d \log n}{n} \to 0$. - Fix $J \geq \lfloor \flbeta / 2 \rfloor$ and assume that - $\lambda \gg n^{\frac{1}{d + 2 \beta}}$ - and $B \gg n^{\frac{2 \beta - 2}{d + 2 \beta}}$. - For a confidence level $\alpha \in (0, 1)$, - let $q_{1 - \alpha / 2}$ be as in Theorem~\ref{thm:mondrian_confidence}. Then - % - \begin{align*} - \P \left( - \mu(x) \in - \left[ - \hat \mu_\rd(x) - - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} - q_{1 - \alpha / 2}, \ - \hat \mu_\rd(x) - + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} - q_{1 - \alpha / 2} - \right] - \right) - \to - 1 - \alpha. - \end{align*} - -\end{theorem} - -One important benefit of our debiasing technique is made clear in -Theorem~\ref{thm:mondrian_confidence_debiased}: the restrictions imposed on the -lifetime -parameter $\lambda$ are substantially relaxed, especially in smooth classes -with large $\beta$. 
Beyond this high-level benefit, the relaxed conditions are also useful for the
practical selection of appropriate lifetimes for estimation and inference
respectively; see Section~\ref{sec:mondrian_parameter_selection} for more
details. Nonetheless, such improvements do not come without concession. The
limiting variance $\Sigma_\rd(x)$ of the debiased estimator is larger than that
of the non-debiased version (the extent of this increase depends on the choice
of the debiasing parameters $a_r$), leading to wider confidence intervals and
larger estimation error in small samples despite the theoretical asymptotic
improvements.

\subsection{Minimax optimality}

Our final result Theorem~\ref{thm:mondrian_minimax} shows that,
when using an appropriate sequence of lifetime parameters $\lambda$,
the debiased Mondrian random forest estimator
achieves, up to constants, the minimax-optimal rate of convergence
for estimating a regression function $\mu \in \cH^\beta$
in $d$ dimensions \citep{stone1982optimal}.
This result holds for all $d \geq 1$ and all $\beta > 0$,
complementing a previous result established only for $\beta \in (0, 2]$
by \citet{mourtada2020minimax}.
%
\begin{theorem}[Minimax optimality of the debiased
    Mondrian random forest estimator]%
    \label{thm:mondrian_minimax}
    Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator},
    and let $J \geq \lfloor \flbeta / 2 \rfloor$,
    $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$, and
    $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$. Then
    %
    \begin{align*}
        \E \left[
        \big( \hat \mu_\rd(x) - \mu(x) \big)^2
        \right]^{1/2}
        &\lesssim
        \sqrt{\frac{\lambda^d}{n}}
        + \frac{1}{\lambda^\beta}
        + \frac{1}{\lambda \sqrt B}
        \lesssim
        n^{-\frac{\beta}{d + 2 \beta}}.
    \end{align*}
    %
\end{theorem}

The sequence of lifetime parameters $\lambda$ required in
Theorem~\ref{thm:mondrian_minimax} is chosen to balance the bias and standard
deviation bounds implied by Theorem~\ref{thm:mondrian_bias_debiased} and
Theorem~\ref{thm:mondrian_clt_debiased} respectively, in order to minimize the
pointwise mean squared error. While selecting an optimal debiasing order $J$
needs only knowledge of an upper bound on the smoothness $\beta$, choosing an
optimal sequence of $\lambda$ values does assume that $\beta$ is known a
priori. The problem of adapting to $\beta$ from data is challenging and beyond
the scope of this chapter; we provide some practical advice for tuning
parameter selection in Section~\ref{sec:mondrian_parameter_selection}.

Theorem~\ref{thm:mondrian_minimax} complements the minimaxity results proven by
\citet{mourtada2020minimax} for Mondrian trees (with $\beta \leq 1$) and for
Mondrian random forests (with $\beta \leq 2$), with one modification: our
version is stated in pointwise rather than integrated mean squared error. This
is because our debiasing procedure is designed to handle interior smoothing
bias and so does not provide any correction for boundary bias. We leave
the development of such boundary corrections to future work, but constructions
similar to higher-order boundary-correcting kernels should be possible.
If the -region of integration is a compact set in the interior of $[0,1]^d$, then we do -obtain an optimal integrated mean squared error bound: if $\delta \in (0, 1/2)$ -is fixed then under the same conditions as Theorem~\ref{thm:mondrian_minimax}, -% -\begin{align*} - \E \left[ - \int_{[\delta, 1-\delta]^d} - \big( - \hat \mu_\rd(x) - - \mu(x) - \big)^2 - \diff x - \right]^{1/2} - &\lesssim - \sqrt{\frac{\lambda^d}{n}} - + \frac{1}{\lambda^\beta} - + \frac{1}{\lambda \sqrt B} - \lesssim - n^{-\frac{\beta}{d + 2 \beta}}. -\end{align*} - -The debiased Mondrian random forest estimator defined in -\eqref{eq:mondrian_debiased} is -a linear combination of Mondrian random forests, and as such contains both a -sum over $0 \leq r \leq J$, representing the debiasing procedure, and a sum -over $1 \leq b \leq B$, representing the forest averaging. We have thus far -been interpreting this estimator as a debiased version of the standard Mondrian -random forest given in \eqref{eq:mondrian_estimator}, but it is -equally valid to swap the order of these sums. This gives rise to an -alternative point of view: we replace each Mondrian random tree with a -``debiased'' version, and then take a forest of such modified trees. This -perspective is more in line with existing techniques for constructing -randomized ensembles, where the outermost operation represents a $B$-fold -average of randomized base learners, not necessarily locally constant decision -trees, each of which has a small bias component \citep{caruana2004ensemble, -zhou2019deep, friedberg2020local}. - -\section{Tuning parameter selection}% -\label{sec:mondrian_parameter_selection} - -We discuss various procedures for selecting the parameters involved in fitting -a debiased Mondrian random forest; namely the base lifetime parameter -$\lambda$, the number of trees in each forest $B$, the bias correction order -$J$, and the debiasing scale parameters $a_r$ for $0 \leq r \leq J$. - -\subsection{Selecting the base lifetime parameter -\texorpdfstring{$\lambda$}{lambda}}% -\label{sec:mondrian_lifetime_selection} - -The most important parameter is the base Mondrian lifetime parameter $\lambda$, -which plays the role of a complexity parameter and thus governs the overall -bias--variance trade-off of the estimator. Correct tuning of $\lambda$ is -especially important in two main respects: -% -firstly, in order to use the central limit theorem established in -Theorem~\ref{thm:mondrian_clt_debiased}, we must have that the bias converges -to zero, -requiring $\lambda \gg n^{\frac{1}{d + 2\beta}}$. -% -Secondly, the minimax optimality result of Theorem~\ref{thm:mondrian_minimax} -is valid only in the regime $\lambda \asymp n^{\frac{1}{d + 2\beta}}$, and thus -requires careful determination in the more realistic finite-sample setting. For -clarity, in this section we use the notation $\hat\mu_\rd(x; \lambda, J)$ for -the debiased Mondrian random forest with lifetime $\lambda$ and debiasing order -$J$ as in \eqref{eq:mondrian_debiased}. -Similarly, write $\hat\Sigma_\rd(x; \lambda, J)$ for the associated -variance estimator given in \eqref{eq:mondrian_debiased_variance_estimator}. - -For minimax-optimal point estimation when $\beta$ is known, -choose any sequence $\lambda \asymp n^{\frac{1}{d + 2\beta}}$ -and use $\hat\mu_\rd(x; \lambda, J)$ with $J = \lfloor \flbeta / 2 \rfloor$, -following the theory given in Theorem~\ref{thm:mondrian_minimax}. 

For an explicit example of how to choose the lifetime, one can instead use
$\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J-1\big)$
so that the leading bias is explicitly characterized by
Theorem~\ref{thm:mondrian_bias_debiased},
and with $\hat\lambda_{\AIMSE}(J-1)$ as defined below.
This is no longer minimax-optimal as $J-1 < J$
does not satisfy the conditions of Theorem~\ref{thm:mondrian_minimax}.

For performing inference, a more careful procedure is required;
we suggest the following method assuming $\beta > 2$.
Set $J = \lfloor \flbeta / 2 \rfloor$ as before,
and use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$
and $\hat\Sigma_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$
to construct a confidence interval.
The reasoning for this is that we select a lifetime tailored for a more biased
estimator than we actually use. This results in an inflated lifetime estimate,
guaranteeing the resulting bias is negligible when it is plugged into the fully
debiased estimator. This approach to tuning parameter selection and debiasing
for valid nonparametric inference corresponds to an application of robust bias
correction \citep{calonico2018effect,calonico2022coverage},
where the point estimator is bias-corrected
and the robust standard error estimator incorporates the additional
sampling variability introduced by the bias correction.
This leads to a more refined distributional approximation
but does not necessarily exhaust the underlying
smoothness of the regression function.
An alternative inference approach based on Lepskii's method
\citep{lepskii1992asymptotically,birge2001alternative}
could be developed with the latter goal in mind.

It remains to propose a concrete method for computing $\hat\lambda_{\AIMSE}(J)$
in the finite-sample setting; we suggest two such procedures based on plug-in
selection with polynomial estimation and cross-validation respectively,
building on classical ideas from the nonparametric
smoothing literature \citep{fan2020statistical}.

\subsubsection*{Lifetime selection with polynomial estimation}

Firstly, suppose $X_i \sim \Unif\big([0,1]^d\big)$
and that the leading bias of $\hat\mu_\rd(x)$ is well approximated by an
additively separable function so that,
writing $\partial^{2 J + 2}_j \mu(x)$
for $\partial^{2 J + 2} \mu(x) / \partial x_j^{2 J + 2}$,
%
\begin{align*}
    \frac{\bar \omega B_{J+1}(x)}{\lambda^{2 J + 2}}
    &\approx
    \frac{1}{\lambda^{2 J + 2}}
    \frac{\bar \omega }{J + 2}
    \sum_{j=1}^d
    \partial^{2 J + 2}_j \mu(x).
\end{align*}
%
Now suppose the model is homoscedastic so $\sigma^2(x) = \sigma^2$ and
the limiting variance of $\hat\mu_\rd$ is
%
\begin{align*}
    \frac{\lambda^d}{n}
    \Sigma_\rd(x)
    &=
    \frac{\lambda^d \sigma^2}{n}
    \sum_{r=0}^{J}
    \sum_{r'=0}^{J}
    \omega_r
    \omega_{r'}
    \left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*}
%
The asymptotic integrated mean squared error (AIMSE) is
%
\begin{align*}
    \AIMSE(\lambda, J)
    &=
    \frac{1}{\lambda^{4 J + 4}}
    \frac{\bar \omega^2}{(J + 2)^2}
    \int_{[0,1]^d}
    \left(
        \sum_{j=1}^d
        \partial^{2 J + 2}_j \mu(x)
    \right)^2
    \diff x \\
    &\quad+
    \frac{\lambda^d \sigma^2}{n}
    \sum_{r=0}^{J}
    \sum_{r'=0}^{J}
    \omega_r
    \omega_{r'}
    \left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*}
%
Minimizing over $\lambda > 0$ yields the AIMSE-optimal lifetime parameter
%
\begin{align*}
    \lambda_{\AIMSE}(J)
    &=
    \left(
        \frac{
            \frac{(4 J + 4) \bar \omega^2}{(J + 2)^2}
            n \int_{[0,1]^d}
            \left(
                \sum_{j=1}^d
                \partial^{2 J + 2}_j \mu(x)
            \right)^2
            \diff x
        }{
            d \sigma^2
            \sum_{r=0}^{J}
            \sum_{r'=0}^{J}
            \omega_r
            \omega_{r'}
            \left( \ell_{r r'} + \ell_{r' r} \right)^d
        }
    \right)^{\frac{1}{4 J + 4 + d}}.
\end{align*}
%
An estimator of $\lambda_{\AIMSE}(J)$ is therefore given by
%
\begin{align*}
    \hat\lambda_{\AIMSE}(J)
    &=
    \left(
        \frac{
            \frac{(4 J + 4) \bar \omega^2}{(J + 2)^2}
            \sum_{i=1}^n
            \left(
                \sum_{j=1}^d
                \partial^{2 J + 2}_j \hat\mu(X_i)
            \right)^2
        }{
            d \hat\sigma^2
            \sum_{r=0}^{J}
            \sum_{r'=0}^{J}
            \omega_r
            \omega_{r'}
            \left( \ell_{r r'} + \ell_{r' r} \right)^d
        }
    \right)^{\frac{1}{4 J + 4 + d}}
\end{align*}
%
for some preliminary estimators
$\partial^{2 J + 2}_j \hat\mu(x)$ and $\hat\sigma^2$.
These can be obtained by fitting a global polynomial regression
of order $2 J + 4$ to the data, without interaction terms.
To do this, define the $n \times ((2 J + 4)d + 1)$ design matrix $P$ with rows
%
\begin{align*}
    P_i = \big(
    1, X_{i1}, X_{i1}^2, \ldots, X_{i1}^{2 J + 4},
    X_{i2}, X_{i2}^2, \ldots, X_{i2}^{2 J + 4},
    \ldots,
    X_{id}, X_{id}^2, \ldots, X_{id}^{2 J + 4}
    \big),
\end{align*}
%
and let
%
$P_x = \big(
1, x_{1}, x_{1}^2, \ldots, x_{1}^{2 J + 4},
x_{2}, x_{2}^2, \ldots, x_{2}^{2 J + 4},
\ldots,
x_{d}, x_{d}^2, \ldots, x_{d}^{2 J + 4}
\big).
$
%
Then we define the derivative estimator as
%
\begin{align*}
    \partial^{2 J + 2}_j \hat\mu(x)
    &=
    \partial^{2 J + 2}_j P_x
    \big( P^\T P \big)^{-1}
    P^\T \bY \\
    &=
    (2J + 2)!
    \left(
        0_{1 + (j-1)(2 J + 4) + (2J + 1)},
        1, (2J + 3) x_j, \tfrac{(2J + 3)(2J + 4)}{2} x_j^2,
        0_{(d-j)(2 J + 4)}
    \right)
    \big( P^\T P \big)^{-1}
    P^\T \bY,
\end{align*}
%
and the variance estimator $\hat\sigma^2$ is
based on the residual sum of squared errors of this model:
%
\begin{align*}
    \hat\sigma^2
    &=
    \frac{1}{n - (2J + 4)d - 1}
    \big(
    \bY^\T \bY
    - \bY^\T P \big( P^\T P \big)^{-1} P^\T \bY
    \big).
\end{align*}

\subsubsection*{Lifetime selection with cross-validation}

As an alternative to the analytic plug-in methods described above, one can use
a cross-validation approach. While leave-one-out cross-validation (LOOCV) can
be applied directly \citep{fan2020statistical},
the linear smoother structure of the (debiased) Mondrian
random forest estimator allows a computationally simpler formulation. Writing
$\hat\mu_\rd^{-i}(x)$ for a debiased Mondrian random forest estimator fitted
without the $i$th data sample, it is easy to show that
%
\begin{align*}
    \LOOCV(\lambda, J)
    &=
    \frac{1}{n}
    \sum_{i=1}^{n}
    \left( Y_i - \hat\mu_\rd^{-i}(X_i) \right)^2 \\
    &=
    \frac{1}{n}
    \sum_{i=1}^{n}
    \left(
        \sum_{r=0}^{J}
        \omega_r
        \frac{1}{B}
        \sum_{b=1}^{B}
        \frac{1}{1 - 1/N_{r b}(X_i)}
        \left( Y_i -
            \sum_{j=1}^{n}
            \frac{ Y_j \I \left\{ X_j \in T_{r b}(X_i) \right\}}
            {N_{r b}(X_i)}
        \right)
    \right)^{2},
\end{align*}
%
avoiding the need to refit the model with each sample left out in turn.
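Given the per-tree cell memberships, this shortcut is immediate to implement;
the following sketch (Python, illustrative names only, assuming every cell
$T_{r b}(X_i)$ contains at least two samples so that the correction factor is
finite) evaluates the criterion without refitting.
%
\begin{verbatim}
import numpy as np

def loocv_score(Y, memberships, omega):
    # memberships[r][b]: (n, n) boolean array whose (i, j) entry
    # indicates X_j in T_{r b}(X_i), taken from the fitted trees
    n = len(Y)
    total = np.zeros(n)
    for r, forest in enumerate(memberships):
        level = np.zeros(n)
        for M in forest:
            N = M.sum(axis=1)              # N_{r b}(X_i), assumed >= 2
            fit = M @ Y / N                # within-cell averages
            level += (Y - fit) / (1.0 - 1.0 / N)
        total += omega[r] * level / len(forest)
    return float(np.mean(total ** 2))
\end{verbatim}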
-Supposing $X_i \sim \Unif\big([0,1]^d\big)$ and -replacing $1/N_{r b}(X_i)$ with their average expectation -$ \frac{1}{J+1} \sum_{r=0}^{J} \E \left[ 1/N_{r b}(X_i) \right] -\approx \bar a^d \lambda^d / n$ -where $\bar a^d = \frac{1}{J+1} \sum_{r=0}^{J} a_r^d$ -gives the generalized cross-validation (GCV) formula -% -\begin{align} - \label{eq:mondrian_gcv} - \GCV(\lambda, J) - &= - \frac{1}{n} - \sum_{i=1}^{n} - \left( - \frac{Y_i - \hat\mu_\rd(X_i)} - {1 - \bar a^d \lambda^d / n} - \right)^2. -\end{align} -% -The lifetime can then be selected by computing -either $\hat\lambda_{\LOOCV} \in \argmin_\lambda \LOOCV(\lambda, J)$ -or $\hat\lambda_{\GCV} \in \argmin_\lambda \GCV(\lambda, J)$. -See Section~\ref{sec:mondrian_weather} for a practical illustration. - -\subsection{Choosing the other parameters} - -\subsubsection*{The number \texorpdfstring{$B$}{B} of trees in each forest}% - -If no debiasing is applied, we suggest -$B = \sqrt{n}$ to satisfy -Theorem~\ref{thm:mondrian_confidence}. -If debiasing is used then we recommend -$B = n^{\frac{2J-1}{2J}}$, consistent with -Theorem~\ref{thm:mondrian_confidence_debiased} -and Theorem~\ref{thm:mondrian_minimax}. - -\subsubsection*{The debiasing order \texorpdfstring{$J$}{J}}% - -When debiasing a Mondrian random forest, one must decide -how many orders of bias to remove. This requires some -oracle knowledge of the H{\"o}lder smoothness of $\mu$ and $f$, which is -difficult to estimate statistically. As such, we recommend -removing only the first one or two bias terms, taking $J \in \{0,1,2\}$ to -avoid overly inflating the variance of the estimator. - -\subsubsection*{The debiasing coefficients \texorpdfstring{$a_r$}{ar}}% - -As in Section~\ref{sec:mondrian_debiased}, we take $a_r$ to be a fixed -geometric or arithmetic sequence. For example, one could set -$a_r = (1+\gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. -We suggest taking $a_r = 1.05^r$. - -\section{Illustrative example: weather forecasting}% -\label{sec:mondrian_weather} - -To demonstrate our methodology for estimation and inference with Mondrian random -forests, we consider a simple application -to a weather forecasting problem. We emphasize that the main aim of this -section is to provide intuition and understanding for how a Mondrian random -forest may be used in practice, and we refrain from an in-depth analysis of the -specific results obtained. Indeed, our assumption of i.i.d.\ data is -certainly violated with weather data, due to the time-series -structure of sequential observations. -Nonetheless, we use data from the \citet{bureau2017daily}, containing daily -weather information from 2007--2017, at 49 different -locations across Australia, with $n = 125\,927$ samples. - -\begin{figure}[b!] - \centering - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_data.png}% - \end{subfigure} - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_data_filled_partition.png}% - \end{subfigure} - \caption[Australian weather forecasting data]{ - Australian weather forecasting data. Left: colors indicate the response - variable of dry (orange) or wet (blue) on the following - day. Right: the data is overlaid with a Mondrian random tree, - fitted with a lifetime of $\lambda = 5$ - selected by generalized cross-validation. 
Cell colors represent the response - proportions.} - \label{fig:mondrian_weather_data} -\end{figure} - -We consider the classification problem of predicting whether or not it will -rain on the following day using two covariates: the percentage relative -humidity, and the pressure in mbar, both at 3pm on the current day. For the -purpose of framing this as a nonparametric regression problem, we consider -estimating the probability of rain as the regression function by setting -$Y_i = 1$ if there is rain on the following day and $Y_i = 0$ otherwise. -Outliers with pressure less than 985\,mbar or more than 1040\,mbar are removed -to justify the assertion in Assumption~\ref{ass:mondrian_data} that the density -of the covariates should be bounded away from zero, and the features are -linearly scaled to provide normalized samples -$(X_i, Y_i) \in [0, 1]^2 \times \{0, 1\}$. -We fit a Mondrian random forest to the data as defined in -Section~\ref{sec:mondrian_forests}, selecting the lifetime parameter with the -generalized cross-validation (GCV) method detailed in -Section~\ref{sec:mondrian_lifetime_selection}. - -Figure~\ref{fig:mondrian_weather_data} plots the -data, using colors to indicate the response values, and illustrates how a -single Mondrian tree is fitted by sampling from an independent Mondrian process -and then computing local averages (equivalent to response proportions in this -special setting with binary outcomes) within each cell. The general pattern of -rain being predicted by high humidity and low pressure is apparent, with the -preliminary tree estimator taking the form of a step function on axis-aligned -rectangles. This illustrates the first-order bias of Mondrian random trees -discussed in Section~\ref{sec:mondrian_clt}, with the piecewise constant -estimator providing a poor approximation for the smooth true regression -function. - -\begin{figure}[b!] - \centering - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_forest_2.png}% - \end{subfigure} - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_forest_design.png}% - \end{subfigure} - \caption[Fitting Mondrian random forests to the Australian weather data]{ - Fitting Mondrian random forests to the Australian weather data. - Left: with $B=2$ trees, individual cells are clearly visible and the step - function persists. Right: with $B=40$ trees, the estimate is much smoother - as the individual tree estimates average out. - Three design points are identified for further analysis.} - \label{fig:mondrian_weather_forest} -\end{figure} - -Figure~\ref{fig:mondrian_weather_forest} adds more trees to the estimator, -demonstrating the effect of increasing the forest size first to $B=2$ -and then to $B=40$. -As more trees are included in the Mondrian random forest, -the regression estimate $\hat \mu(x)$ becomes smoother and therefore also -enjoys improved bias properties as shown in -Theorem~\ref{thm:mondrian_bias}, assuming a correct model specification. -We also choose three specific design points in the -(humidity, pressure) covariate space, -namely (20\%, 1020\,mbar), (70\%, 1000\,mbar), and (80\%, 990\,mbar), -at which to conduct inference -by constructing confidence intervals. See Table~\ref{tab:mondrian_weather_ci} -for the results. - -\begin{figure}[b!] 
- \centering - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_gcv.png}% - \end{subfigure} - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/weather_debiased_forest_design.png}% - \end{subfigure} - \caption[Cross-validation and debiasing for the Australian weather data]{ - Left: mean squared error and generalized cross-validation scores - for Mondrian random forests with the Australian weather data. - Right: a debiased Mondrian random forest with $B=20$, giving $40$ trees - in total. Three design points are identified for further analysis.} - \label{fig:mondrian_weather_gcv} -\end{figure} - -In Figure~\ref{fig:mondrian_weather_gcv} we show the mean squared error and GCV -scores -computed using \eqref{eq:mondrian_gcv} with $B=400$ trees -for several candidate lifetime parameters $\lambda$. As -expected, the mean squared error decreases monotonically -as $\lambda$ increases and the model -overfits, but the GCV score is minimized at a value which appropriately -balances the bias and variance; we take $\lambda = 5$. -We then fit a debiased Mondrian forest -with bias correction order $J = 1$ as described in -Section~\ref{sec:mondrian_debiased}, using $B=20$ trees at each debiasing level -$r \in \{0, 1\}$ for a total of $40$ trees. -We continue to use the same lifetime parameter -$\lambda = 5$ selected through GCV without debiasing, following the approach -recommended in Section~\ref{sec:mondrian_lifetime_selection} to ensure valid -inference -through negligible bias. -The resulting debiased Mondrian random forest estimate is noticeably -less smooth than the version without bias correction. -This is expected due to both the inflated variance resulting from the debiasing -procedure, and the undersmoothing enacted by selecting a lifetime parameter -using GCV on the original estimator without debiasing. - -\begin{table}[t] - \centering - \begin{tabular}{|c|c|c|c|c|c|c|} - \hline - \multirow{2}{*}{Point} - & \multirow{2}{*}{Humidity} - & \multirow{2}{*}{Pressure} - & \multicolumn{2}{|c|}{No debiasing, $J=0$} - & \multicolumn{2}{|c|}{Debiasing, $J=1$} \\ - \cline{4-7} - & & - & $\hat\mu(x)$ & 95\% CI - & $\hat\mu(x)$ & 95\% CI \\ - \hline - $1$ & $20\%$ & $1020\,\textrm{mbar}$ & - $\phantom{0}4.2\%$ & - $3.9\%$ -- $4.5\%$ & - $\phantom{0}2.0\%$ & - $1.6\%$ -- $2.4\%$ \\ - $2$ & $70\%$ & $1000\,\textrm{mbar}$ & - $52.6\%$ & - $51.7\%$ -- $53.6\%$ & - $59.8\%$ & - $57.8\%$ -- $61.9\%$ \\ - $3$ & $80\%$ & $\phantom{1}990\,\textrm{mbar}$ & - $78.1\%$ & - $75.0\%$ -- $81.2\%$ & - $93.2\%$ & - $86.7\%$ -- $99.6\%$ \\ - \hline - \end{tabular} - \caption[Results for the Australian weather data]{ - Results for the Australian weather data - at three specified design points.} - \label{tab:mondrian_weather_ci} -\end{table} - -Table~\ref{tab:mondrian_weather_ci} presents numerical results for estimation -and -inference at the three specified design points. We first give the outcomes -without debiasing, using a Mondrian random forest with $B = 400$ trees and -$\lambda = 5$ selected by GCV. We then show the results with a first-order -($J=1$) debiased Mondrian random forest using $B = 200$ (again a total of -$400$ trees) and the same value of $\lambda = 5$. The predicted chance of rain -$\hat\mu(x)$ is found to vary substantially across different covariate values, -and the resulting confidence intervals (CI) are generally narrow due to the -large sample size and moderate lifetime parameter. 
The forest with debiasing -exhibits more extreme predictions away from $50\%$ and wider confidence -intervals in general, in line with the illustration in -Figure~\ref{fig:mondrian_weather_gcv}. Interestingly, the confidence intervals -for the -non-debiased and debiased estimators do not intersect, indicating that the -original estimator is severely biased, and providing further justification for -our modified debiased random forest estimator. - -\section{Conclusion}% -\label{sec:mondrian_conclusion} - -We gave a central limit theorem for the Mondrian random forest estimator -and showed how to perform statistical inference on an unknown nonparametric -regression function. We introduced debiased versions of the Mondrian random -forest, and demonstrated their advantages -for statistical inference and minimax-optimal estimation. We discussed -tuning parameter selection, enabling a fully feasible and practical methodology. -An application to weather forecasting was presented -as an illustrative example. Implementations of this chapter's methodology and -empirical results are provided by a Julia -package at \github{wgunderwood/MondrianForests.jl}. -This work is based on \citet{cattaneo2023inference}, and has been -presented by Underwood at the University of Illinois Statistics Seminar (2024), -the University of Michigan Statistics Seminar (2024), and the University of -Pittsburgh Statistics Seminar (2024). - -\chapter{Dyadic Kernel Density Estimators} -\label{ch:kernel} - -% abstract -Dyadic data is often encountered when quantities of interest are associated -with the edges of a network. As such, it plays an important role in statistics, -econometrics, and many other data science disciplines. We consider the problem -of uniformly estimating a dyadic Lebesgue density function, focusing on -nonparametric kernel-based estimators taking the form of dyadic empirical -processes. The main contributions of this chapter -include the minimax-optimal uniform -convergence rate of the dyadic kernel density estimator, along with strong -approximation results for the associated standardized and Studentized -$t$-processes. A consistent variance estimator enables the construction of -valid and feasible uniform confidence bands for the unknown density function. -We showcase the broad applicability of our results by developing novel -counterfactual density estimation and inference methodology for dyadic data, -which can be used for causal inference and program evaluation. A crucial -feature of dyadic distributions is that they may be ``degenerate'' at certain -points in the support of the data, a property making our analysis somewhat -delicate. Nonetheless our methods for uniform inference remain robust to the -potential presence of such points. For implementation purposes, we discuss -inference procedures based on positive semi-definite covariance estimators, -mean squared error optimal bandwidth selectors, and robust bias correction -techniques. We illustrate the empirical finite-sample performance of our -methods both in simulations and with real-world trade data, for which we make -comparisons between observed and counterfactual trade distributions in -different years. Our technical results concerning strong approximations and -maximal inequalities are of potential independent interest. - -\section{Introduction} -\label{sec:kernel_introduction} - -Dyadic data, also known as graphon data, plays an important role in the -statistical, social, behavioral, and biomedical sciences. 
In network settings,
this type of dependent data captures interactions between the units of study,
and its analysis is of interest in statistics \citep{kolaczyk2009statistical},
economics \citep{graham2020network}, psychology \citep{kenny2020dyadic}, public
health \citep{luke2007network}, and many other data science disciplines. For
$n \geq 2$, a dyadic data set contains $\frac{1}{2}n(n-1)$ observed real-valued
random variables
%
\begin{align*}
    \bW_n = (W_{i j} : 1 \leq i < j \leq n),
\end{align*}
%
where each $W_{i j}$ records a pairwise interaction between units $i$ and $j$.
The dyadic kernel density estimator studied in this chapter is
%
\begin{align}
    \label{eq:kernel_estimator}
    \hat f_W(w)
    &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} k_h(W_{i j}, w),
    \qquad w \in \cW,
\end{align}
%
which estimates the Lebesgue density $f_W$ of the variables $W_{i j}$
on a compact interval $\cW \subseteq \R$,
where $k_h(\cdot, w)$ is a kernel function with bandwidth $h > 0$;
precise conditions are given in Section~\ref{sec:kernel_setup}.

We use the following notation.
For $C>0$, define the H\"{o}lder class with smoothness parameter
$\beta > 0$ to be
$\cH^\beta_C(\cX) =
\big\{
    g \in \cC^{\flbeta}(\cX) \! : \!
    \max_{1 \leq r \leq \flbeta}
    \big| g^{(r)}(x) \big| \leq C,
    \big| g^{(\flbeta)}(x) - g^{(\flbeta)}(x') \big|
    \leq C |x-x'|^{\beta - \flbeta},
    \forall x, x' \in \cX
\big\}$,
where $\flbeta$ denotes the largest integer which is strictly less than $\beta$.
Note that $\cH^1_C(\cX)$ is the class of $C$-Lipschitz functions on $\cX$.
For $a \in \R$ and $b \geq 0$, we write $[a \pm b]$ for the interval
$[a-b, a+b]$. For non-negative sequences $a_n$ and $b_n$, write
$a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that
$a_n / b_n$ is bounded for $n\geq 1$.
Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$.
If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$.
For random non-negative sequences $A_n$ and $B_n$, write
$A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if
$A_n / B_n$ is bounded in probability.
Write $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability.
For $a,b \in \R$, define $a\wedge b=\min\{a,b\}$ and $a \vee b = \max\{a,b\}$.

\section{Setup}\label{sec:kernel_setup}

We impose the following two assumptions throughout this chapter,
which concern firstly the dyadic data generating process, and
secondly the choice of kernel and bandwidth sequence.

%
\begin{assumption}[Data generation]
    \label{ass:kernel_data}
    %
    % A and V variables
    Let $\bA_n = (A_i: 1 \leq i \leq n)$ be i.i.d.\ random variables supported on
    $\cA \subseteq \R$ and let $\bV_n = (V_{i j}: 1 \leq i < j \leq n)$ be
    i.i.d.\ random variables with a Lebesgue density $f_V$ on $\R$, with $\bA_n$
    independent of $\bV_n$.
    %
    % W variables
    Let $W_{i j} = W(A_i, A_j, V_{i j})$ and
    $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$, where $W$ is an unknown real-valued
    function which is symmetric in its first two arguments.
    %
    Let $\cW \subseteq \R$ be a compact interval with positive Lebesgue measure
    $\Leb(\cW)$. The conditional distribution of $W_{i j}$ given $A_i$ and $A_j$
    admits a Lebesgue density $f_{W \mid AA}(w \mid A_i, A_j)$.
    For $C_\rH > 0$ and $\beta \geq 1$, take $f_W \in \cH^\beta_{C_\rH}(\cW)$
    where $f_{W}(w) = \E\left[f_{W \mid AA}(w \mid A_i,A_j)\right]$ and
    $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$
    for all $a,a' \in \cA$. Suppose
    $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV <\infty$ where
    $f_{W \mid A}(w \mid a) = \E\left[f_{W \mid AA}(w \mid A_i,a)\right]$.
    %
\end{assumption}

In Assumption~\ref{ass:kernel_data} we require the density $f_W$ to be in a
$\beta$-smooth H\"older class of functions on the compact interval $\cW$.
H\"older classes are well established in the minimax estimation literature
\citep{stone1982optimal,gine2021mathematical},
with the smoothness parameter $\beta$ appearing
in the minimax-optimal rate of convergence.
If the H\"older condition is -satisfied only piecewise, then our results remain valid provided that the -boundaries between the pieces are known and treated as boundary points. - -If $W(a_1, a_2, v)$ is strictly monotonic and continuously differentiable in -its third argument, we can give the conditional density of $W_{i j}$ explicitly -using the usual change-of-variables formula: with $w=W(a_1,a_2,v)$, we have -$f_{W \mid AA}(w \mid a_1,a_2) -= f_V(v) \big|\partial W(a_1,a_2,v)/\partial v\big|^{-1}$. - -\begin{assumption}[Kernels and bandwidth] - \label{ass:kernel_bandwidth}% - % - Let $h = h(n) > 0$ be a sequence of bandwidths satisfying $h \log n \to 0$ - and $\frac{\log n}{n^2h} \to 0$. For each $w \in \cW$, let $k_h(\cdot, w)$ be - a real-valued function supported on $[w \pm h] \cap \cW$. For an integer - $p \geq 1$, let $k_h$ belong to a family of boundary bias-corrected kernels - of order $p$, i.e., - % - \begin{align*} - \int_{\cW} - (s-w)^r k_h(s,w) \diff{s} - \quad - \begin{cases} - \begin{alignedat}{2} - &= 1 &\qquad &\text{for all } w \in \cW \text{ if }\, r = 0, \\ - &= 0 & &\text{for all } w \in \cW \text{ if }\, 1 \leq r \leq p-1, \\ - &\neq 0 & &\text{for some } w \in \cW \text{ if }\, r = p. - \end{alignedat} - \end{cases} - \end{align*} - % - Also, for $C_\rL > 0$, - suppose $k_h(s, \cdot) \in \cH^1_{C_\rL h^{-2}}(\cW)$ - for all $s \in \cW$. - % -\end{assumption} - -This assumption allows for all standard compactly supported and possibly -boundary-corrected kernel functions -\citep{wand1994kernel,simonoff1996smoothing}, constructed for example by taking -polynomials on a compact interval and solving a linear system for the -coefficients. Assumption~\ref{ass:kernel_bandwidth} implies -(see Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} -in Appendix~\ref{app:kernel}) -that if $h \leq 1$ then $k_h$ is uniformly bounded by -$C_\rk h^{-1}$ where $C_\rk \vcentcolon = 2 C_\rL + 1 + 1/\Leb(\cW)$. - -\subsection{Bias characterization} -\label{sec:kernel_bias} - -We begin by characterizing and bounding the bias -$B_n(w) = \E \big[ \hat f_W(w) \big] - f_W(w)$. -Theorem~\ref{thm:kernel_bias} is a standard result for the non-random smoothing -bias in kernel density estimation with higher-order kernels and boundary bias -correction, and does not rely on the dyadic structure. - -\begin{theorem}[Bias bound] - \label{thm:kernel_bias} - - Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} - hold. For $w \in \cW$ define the leading bias term as - % - \begin{align*} - b_p(w) - &= - \frac{f_W^{(p)}(w)}{p!} - \int_{\cW} - k_h(s,w) - \left( - \frac{s-w}{h} - \right)^p - \diff{s}. - \end{align*} - % - for $1 \leq p \leq \flbeta$. Then we have the following bias bounds. - % - \begin{enumerate}[label=(\roman*)] - \item If $p \leq \flbeta - 1$, then - $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | - \leq \frac{2 C_\rk C_\rH}{(p+1)!} h^{p+1}$. - - \item If $p = \flbeta$, then - $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | - \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. - - \item If $p \geq \flbeta+1$, then - $\sup_{w \in \cW} | B_n(w) | - \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. - \end{enumerate} - % - Noting that $\sup_{\cW} |b_p(w)| \leq 2 C_\rk C_\rH / p!$, - we deduce that for $h \leq 1$, - % - \begin{align*} - \sup_{w \in \cW} | B_n(w) | - \leq - \frac{4 C_\rk C_\rH}{(p \wedge \flbeta)!} - h^{p \wedge \beta} - \lesssim - h^{p \wedge \beta}. 
    \end{align*}

\end{theorem}

\subsection{Hoeffding-type decomposition and degeneracy}
\label{sec:kernel_degeneracy}

Our next step is to consider the stochastic part
$\hat f_W(w) - \E \big[ \hat f_W(w) \big]$
of the classical bias--variance decomposition. This term is akin to a
U-statistic and thus admits a Hoeffding-type decomposition, presented in
Lemma~\ref{lem:kernel_hoeffding}, which is a key element in our analysis.

\begin{lemma}[Hoeffding-type decomposition for $\hat f_W$]
    \label{lem:kernel_hoeffding}

    Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}
    hold. Define the linear, quadratic, and error terms
    %
    \begin{align*}
        L_n(w)
        &=
        \frac{2}{n} \sum_{i=1}^n l_i(w),
        &Q_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} q_{i j}(w), \\
        E_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w)
    \end{align*}
    %
    respectively, where
    %
    \begin{align*}
        l_i(w)
        &=
        \E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w)\right], \\
        q_{i j}(w)
        &=
        \E\left[k_h(W_{i j},w) \mid A_i, A_j\right]
        - \E\left[k_h(W_{i j},w) \mid A_i\right]
        - \E\left[k_h(W_{i j},w) \mid A_j\right]
        + \E\left[k_h(W_{i j},w)\right], \\
        e_{i j}(w)
        &=
        k_h(W_{i j},w) - \E\left[k_h(W_{i j},w) \mid A_i, A_j\right].
    \end{align*}
    %
    Then, recalling the bias term $B_n$ from Section~\ref{sec:kernel_bias},
    we have the Hoeffding-type decomposition
    %
    \begin{align}
        \label{eq:kernel_hoeffding}
        \hat f_W(w) - f_W(w) = L_n(w) + Q_n(w) + E_n(w) + B_n(w).
    \end{align}
    %
    The processes $L_n$, $Q_n$, and $E_n$ are mean-zero
    with $\E\big[L_n(w)\big] = \E\big[Q_n(w)\big] = \E\big[E_n(w)\big] = 0$
    for all $w \in \cW$. They are also orthogonal,
    satisfying $\E\big[ L_n(w) Q_n(w') \big] = \E\big[ L_n(w) E_n(w') \big]
    = \E\big[ Q_n(w) E_n(w') \big] = 0$ for all $w, w' \in \cW$.
    %
\end{lemma}

The process $L_n$ is the H{\'a}jek projection of a U-process,
which can exhibit degeneracy if $\Var[L_n(w)] = 0$ at some
or all points $w \in \cW$. To characterize the different possible
degeneracy types in Lemma~\ref{lem:kernel_trichotomy},
we first introduce the following lower and upper degeneracy constants:
%
\begin{align*}
    \Dl^2 := \inf_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right]
    \qquad \text{ and } \qquad
    \Du^2 := \sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right].
\end{align*}
%
\begin{lemma}[Trichotomy of degeneracy]%
    \label{lem:kernel_trichotomy}%
    %
    Grant Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}.
    Then the type of degeneracy exhibited by $\hat f_W(w)$
    is precisely one of the following three possibilities.
    %
    \begin{enumerate}[label=(\roman*)]

        \item Total degeneracy:
            $\Du = \Dl = 0$. Then $L_n(w) = 0$ for all $w \in \cW$ almost surely.

        \item No degeneracy:
            $\Dl > 0$. Then $\inf_{w \in \cW} \Var[L_n(w)] \geq \frac{2 \Dl^2}{n}$
            for all large enough $n$.

        \item Partial degeneracy:
            $\Du > \Dl = 0$. There exists $w \in \cW$ with
            $\Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$;
            such a point is labeled \emph{degenerate} and satisfies
            $\Var[L_n(w)] \leq 64 C_\rk C_\rH C_\rd \frac{h}{n}$.
            There is also a point $w' \in \cW$ with
            $\Var\left[f_{W \mid A}(w' \mid A_i)\right] > 0$;
            such a point is labeled \emph{non-degenerate} and satisfies
            $\Var[L_n(w')] \geq
            \frac{2}{n} \Var\left[f_{W \mid A}(w' \mid A_i)\right]$
            for all large enough $n$.
- - \end{enumerate} - -\end{lemma} - -The following lemma describes the uniform stochastic order of the different -terms in the Hoeffding-type decomposition, explicitly accounting for potential -degeneracy. - -\begin{lemma}[Uniform concentration] - \label{lem:kernel_uniform_concentration} - - Suppose Assumptions \ref{ass:kernel_data} and - \ref{ass:kernel_bandwidth} hold. Then - % - \begin{align*} - \E\left[ \sup_{w \in \cW} |L_n(w)| \right] - &\lesssim \frac{\Du}{\sqrt n}, - &\E\left[ \sup_{w \in \cW} |Q_n(w)| \right] - &\lesssim \frac{1}{n}, - &\E\left[ \sup_{w \in \cW} |E_n(w)| \right] - &\lesssim \sqrt{\frac{\log n}{n^2h}}. - \end{align*} -\end{lemma} - -Lemma~\ref{lem:kernel_uniform_concentration} captures the potential total -degeneracy -of $L_n$ by illustrating how if $\Du=0$ then $L_n=0$ everywhere on $\cW$ almost -surely. The following lemma captures the potential partial degeneracy of $L_n$, -where $\Du > \Dl = 0$. For $w,w' \in \cW$, define the covariance function -% -\begin{align*} - \Sigma_n(w,w') = - \E\Big[ - \Big( - \hat f_W(w) - - \E\big[\hat f_W(w)\big] - \Big) - \Big( - \hat f_W(w') - - \E\big[\hat f_W(w')\big] - \Big) - \Big]. -\end{align*} -% -\begin{lemma}[Variance bounds] - \label{lem:kernel_variance_bounds} - Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} - hold. Then for sufficiently large $n$, - % - \begin{align*} - \frac{\Dl^2}{n} + \frac{1}{n^2h} - \inf_{w \in \cW} f_W(w) - &\lesssim - \inf_{w \in \cW} \Sigma_n(w,w) - \leq - \sup_{w \in \cW} \Sigma_n(w,w) - \lesssim - \frac{\Du^2}{n} + \frac{1}{n^2h}. - \end{align*} - % -\end{lemma} - -As a simple example of the different types of degeneracy, consider the family -of dyadic distributions $\P_{\pi}$ indexed by $\pi = (\pi_1, \pi_2, \pi_3)$ -with $\sum_{i=1}^3 \pi_i = 1$ and $\pi_i \geq 0$, generated by -$W_{i j} = A_i A_j + V_{i j}$, where $A_i$ equals $-1$ with probability -$\pi_1$, equals $0$ with probability $\pi_2$ and equals $+1$ with probability -$\pi_3$, and $V_{i j}$ is standard Gaussian. This model induces a latent -``community structure'' where community membership is determined by the value -of $A_i$ for each node $i$, and the interaction outcome $W_{i j}$ is a function -only of the communities which $i$ and $j$ belong to and some idiosyncratic -noise. Unlike the stochastic block model \citep{kolaczyk2009statistical}, our -setup assumes that community membership has no impact on edge existence, as we -work with fully connected networks; see Section~\ref{sec:kernel_trade_data} for -a -discussion of how to handle missing edges in practice. Also note that the -parameter of interest in this chapter is the Lebesgue density of a continuous -random variable $W_{i j}$ rather than the probability of network edge -existence, which is the focus of the graphon estimation literature -\citep{gao2021minimax}. - -In line with Assumption~\ref{ass:kernel_data}, $\bA_n$ and $\bV_n$ are i.i.d.\ -sequences independent of each other. Then -$f_{W \mid AA}(w \mid A_i, A_j) = \phi(w - A_i A_j)$,\, -$f_{W \mid A}(w \mid A_i) = \pi_1 \phi(w + A_i) + \pi_2 \phi(w) -+ \pi_3 \phi(w - A_i)$, and -$f_W(w) = (\pi_1^2 + \pi_3^2) \phi(w-1) + \pi_2 (2 - \pi_2) \phi(w) + 2 -\pi_1 \pi_3 \phi(w+1),$ -where $\phi$ denotes the probability density function of the standard normal -distribution. Note that $f_W(w)$ is strictly positive for all $w \in \R$. 
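For example, if $\pi_1 = \pi_3 = \frac{1}{2}$ and $\pi_2 = 0$,
then $A_i \in \{-1, +1\}$ and, by the symmetry of $\phi$,
%
\begin{align*}
    f_{W \mid A}(w \mid A_i)
    = \frac{1}{2} \phi(w + A_i) + \frac{1}{2} \phi(w - A_i)
    = \frac{1}{2} \phi(w - 1) + \frac{1}{2} \phi(w + 1)
\end{align*}
%
does not depend on $A_i$, so
$\Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$ for every $w \in \R$.
Similar calculations verify the other parameter choices considered below.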
-Consider the parameter choices: -% -\begin{enumerate}[label=(\roman*)] - - \item $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$:\quad - $\P_\pi$ is degenerate at all $w \in \R$, - - \item $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$:\quad - $\P_\pi$ is degenerate only at $w=0$, - - \item $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$:\quad - $\P_\pi$ is non-degenerate for all $w \in \R$. - -\end{enumerate} -% -Figure~\ref{fig:kernel_distribution} demonstrates these phenomena, plotting the -density $f_W$ and the standard deviation of the conditional -density $f_{W|A}$ over $\cW = [-2,2]$ for each choice of the parameter $\pi$. - -The trichotomy of total/partial/no degeneracy is useful for understanding the -distributional properties of the dyadic kernel density estimator -$\hat{f}_W(w)$. Crucially, our need for uniformity in $w$ complicates the -simpler degeneracy/no degeneracy dichotomy observed previously in the -literature \citep{graham2024kernel}. From a pointwise-in-$w$ -perspective, partial degeneracy causes no issues, while it is a fundamental -problem when conducting inference uniformly over $w \in \cW$. We develop -methods that are valid regardless of the presence of partial or total -degeneracy. - -\begin{figure}[t] - \centering - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/distribution_plot_total.pdf} - \caption{Total degeneracy, \\ - $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/distribution_plot_partial.pdf} - \caption{Partial degeneracy, \\ - $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/distribution_plot_none.pdf} - \caption{No degeneracy, \\ - $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} - \end{subfigure} - % - \caption[The family of distributions $\P_\pi$]{ - Density $f_W$ and standard deviation - of $f_{W|A}$ for the family of distributions $\P_\pi$.} - % - \label{fig:kernel_distribution} -\end{figure} - -\section{Point estimation results} -\label{sec:kernel_point_estimation} - -Using the bias bound from Theorem~\ref{thm:kernel_bias} and -the concentration results from Lemma~\ref{lem:kernel_uniform_concentration}, -the next theorem establishes an upper bound on the uniform convergence rate of -$\hat f_W$. -% -\begin{theorem}[Uniform convergence rate]% - \label{thm:kernel_uniform_consistency}% - Suppose that Assumptions \ref{ass:kernel_data} and - \ref{ass:kernel_bandwidth} hold. Then - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \big|\hat{f}_W(w) - f_W(w)\big| - \right] - \lesssim - h^{p\wedge\beta} + \frac{\Du}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}. - \end{align*} -\end{theorem} -% -The implicit constant in Theorem~\ref{thm:kernel_uniform_consistency} depends -only on -$\cW$, $\beta$, $C_\rH$, and the choice of kernel. We interpret this result in -light of the degeneracy trichotomy from Lemma~\ref{lem:kernel_trichotomy}. -These results generalize \citet*[Theorem~1]{chiang2020empirical} -by allowing for compactly supported data and more general kernels -$k_h(\cdot,w)$, enabling boundary-adaptive estimation. - -% -\begin{enumerate}[label=(\roman*)] - \item Partial or no degeneracy: $\Du > 0$. 
- Any bandwidths satisfying - $n^{-1} \log n \lesssim h \lesssim n^{-\frac{1}{2(p\wedge\beta)}}$ yield - $\E\big[\sup_{w \in \cW}\big|\hat f_W(w) - - f_W(w)\big| \big] \lesssim \frac{1}{\sqrt n}$, the ``parametric'' - bandwidth-independent rate noted by \citet{graham2024kernel}. - - \item Total degeneracy: $\Du = 0$. - Minimizing the bound in Theorem~\ref{thm:kernel_uniform_consistency} with - $h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2(p\wedge\beta)+1}}$ - yields $\E\big[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \big] - \lesssim - \big(\frac{\log n}{n^2} \big)^{\frac{p\wedge\beta}{2(p\wedge\beta)+1}}$. -\end{enumerate} - -\subsection{Minimax optimality} - -We establish the minimax rate under the supremum norm for density estimation -with dyadic data. This implies minimax optimality of the kernel density -estimator $\hat f_W$, regardless of the degeneracy type of the dyadic -distribution. - -\begin{theorem}[Uniform minimax optimality] - \label{thm:kernel_minimax} - - Fix $\beta \geq 1$ and $C_\rH > 0$, and take $\cW$ a compact interval with - positive Lebesgue measure. Define $\cP = \cP(\cW, \beta, C_\rH)$ as the class - of dyadic distributions satisfying Assumption~\ref{ass:kernel_data}. Define - $\cP_\rd$ as the subclass of $\cP$ containing only those distributions which - are totally degenerate on $\cW$ in the sense that - $\sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$. Then - % - \begin{align*} - \inf_{\tilde f_W} - \sup_{\P \in \cP} - \E_\P\left[ - \sup_{w \in \cW} - \big| \tilde f_W(w) - f_W(w) \big| - \right] - &\asymp - \frac{1}{\sqrt n}, \\ - \inf_{\tilde f_W} - \sup_{\P \in \cP_\rd} - \E_\P\left[ - \sup_{w \in \cW} - \big| \tilde f_W(w) - f_W(w) \big| - \right] - &\asymp - \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}}, - \end{align*} - % - where $\tilde f_W$ is any estimator depending only on the data - $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$ distributed according to the dyadic - law $\P$. The constants in $\asymp$ depend only on - $\cW$, $\beta$, and $C_\rH$. - -\end{theorem} - -Theorem~\ref{thm:kernel_minimax} shows that the uniform convergence rate of -$n^{-1/2}$ obtained in Theorem~\ref{thm:kernel_uniform_consistency} -(coming from the $L_n$ term) is minimax-optimal in general. -When attention is restricted to totally degenerate dyadic distributions, -$\hat f_W$ also achieves the minimax rate of uniform convergence -(assuming a kernel of sufficiently high order $p \geq \beta$), -which is on the order of -$\left(\frac{\log n}{n^2}\right)^{\frac{\beta}{2\beta+1}}$ and -is determined by the bias $B_n$ and the leading variance term $E_n$ in -\eqref{eq:kernel_hoeffding}. - -Combining Theorems -\ref{thm:kernel_uniform_consistency}~and~\ref{thm:kernel_minimax}, -we conclude that $\hat{f}_W(w)$ achieves the minimax-optimal rate for uniformly -estimating $f_W(w)$ if $h \asymp \left( \frac{\log n}{n^2} -\right)^{\frac{1}{2\beta+1}}$ and a kernel of sufficiently high order -($p \geq \beta$) is used, whether or not there are any degenerate points in the -underlying data generating process. This result appears to be new to the -literature on nonparametric estimation with dyadic data. See -\citet{gao2021minimax} for a contemporaneous review. - -\section{Distributional results} -\label{sec:kernel_inference} - -We investigate the distributional properties of the -standardized $t$-statistic process -% -\begin{align*} - T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\Sigma_n(w,w)}}, -\end{align*} -% -which is not necessarily asymptotically tight. 
Therefore, to approximate the
distribution of the entire $t$-statistic process, as well as specific
functionals thereof, we rely on a novel strong approximation approach outlined
in this section. Our results can be used to perform valid uniform inference
irrespective of the degeneracy type.

This section is largely concerned with distributional properties and thus
frequently requires copies of stochastic processes. For succinctness of
notation, we will not differentiate between a process and its copy, but details
are available in Section~\ref{sec:kernel_app_technical}.

\subsection{Strong approximation}

By the Hoeffding-type decomposition \eqref{eq:kernel_hoeffding} and
Lemma~\ref{lem:kernel_uniform_concentration}, it suffices to consider the
distributional properties of the stochastic process $L_n + E_n$.
Our approach combines the Koml{\'o}s--Major--Tusn{\'a}dy (KMT) approximation
\citep{komlos1975approximation} to obtain a strong approximation of $L_n$ with
a Yurinskii approximation \citep{yurinskii1978error} to obtain a
\emph{conditional} (on $\bA_n$) strong approximation of $E_n$. The latter is
necessary because $E_n$ is akin to a local empirical process of i.n.i.d.\
random variables, conditional on $\bA_n$, and therefore the KMT approximation
is not applicable. These approximations are then combined to give a final
(unconditional) strong approximation for $L_n+E_n$, and thus for the
$t$-statistic process $T_n$.

The following lemma is an application of our generic KMT approximation result
for empirical processes, given in Section~\ref{sec:kernel_app_technical}, which
builds on earlier work by \citet{gine2004kernel} and \citet{gine2010confidence}
and may be of independent interest.

\begin{lemma}[Strong approximation of $L_n$]
    \label{lem:kernel_strong_approx_Ln}
    %
    Suppose that Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth}
    hold. For each $n$ there exists a mean-zero Gaussian process $Z^L_n$ indexed
    on $\cW$ satisfying
    $\E\big[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w) \big| \big]
    \lesssim \frac{\Du \log n}{\sqrt{n}}$, where
    $\E[Z_n^L(w)Z_n^L(w')] = n\E[L_n(w)L_n(w')]$ for all $w, w' \in \cW$. The
    process $Z_n^L$ is a function only of $\bA_n$ and some random noise
    independent of $(\bA_n, \bV_n)$.
\end{lemma}

% donsker case
The strong approximation result in Lemma~\ref{lem:kernel_strong_approx_Ln}
would be
sufficient to develop valid and even optimal uniform inference procedures
whenever both $\Dl > 0$ (no degeneracy in $L_n$) and $n h \gg \log n$
($L_n$ is leading). In this special case, the recent Donsker-type results of
\citet{davezies2021exchangeable} can be applied to analyze the limiting
distribution of the stochastic process $\hat{f}_W$. Alternatively, again only
when $L_n$ is non-degenerate and leading, standard empirical process methods
could also be used. However, even in the special case when $\hat{f}_W(w)$ is
asymptotically Donsker, our result in Lemma~\ref{lem:kernel_strong_approx_Ln}
improves
upon the literature by providing a rate-optimal strong approximation for
$\hat{f}_W$ as opposed to only a weak convergence result. See Theorem
\ref{thm:kernel_infeasible_ucb} and the subsequent discussion below.

% however often non-donsker
More importantly, as illustrated above, it is common in the literature to find
dyadic distributions which exhibit partial or total degeneracy, making the
process $\hat{f}_W$ non-Donsker.
Thus approximating only $L_n$ is in general -insufficient for valid uniform inference, and it is necessary to capture the -distributional properties of $E_n$ as well. -% we do better -The following lemma is an application of our strong approximation result for -empirical processes based on the Yurinskii approximation, which builds on a -refinement by \citet{belloni2019conditional}. - -\begin{lemma}[Conditional strong approximation of $E_n$] - \label{lem:kernel_conditional_strong_approx_En} - % - Suppose Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold - and take any $R_n \to \infty$. For each $n$ there exists $\tilde Z^E_n$ - a mean-zero Gaussian process conditional on $\bA_n$ satisfying - $\sup_{w \in \cW} - \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^E(w) \big| - \lesssim_\P \frac{(\log n)^{3/8} R_n}{n^{1/4}h^{3/8}}$, - where $\E[\tilde Z_n^E(w)\tilde Z_n^E(w')\bigm\vert \bA_n] - =n^2h\E[E_n(w)E_n(w')\bigm\vert \bA_n]$ - for all $w, w' \in \cW$. - % -\end{lemma} - -The process $\tilde Z_n^E$ is a Gaussian process conditional on $\bA_n$ but is -not in general a Gaussian process unconditionally. The following lemma -constructs an unconditional Gaussian process $Z_n^E$ that approximates -$\tilde Z_n^E$. - -\begin{lemma}[Unconditional strong approximation of $E_n$] - \label{lem:kernel_unconditional_strong_approx_En} - - Suppose that Assumptions \ref{ass:kernel_data} and - \ref{ass:kernel_bandwidth} hold. For each $n$ there exists - a mean-zero Gaussian process $Z^E_n$ satisfying - $\E\big[ \sup_{w \in \cW} \big|\tilde Z_n^E(w) - Z_n^E(w)\big| \big] - \lesssim \frac{(\log n)^{2/3}}{n^{1/6}}$, - where $Z_n^E$ is independent of $\bA_n$ and - $\E[Z_n^E(w)Z_n^E(w')]=\E[\tilde Z_n^E(w)\tilde Z_n^E(w')] - = n^2h \, \E[E_n(w)E_n(w')]$ for all $w, w' \in \cW$. - % -\end{lemma} - -Combining Lemmas \ref{lem:kernel_conditional_strong_approx_En} -and~\ref{lem:kernel_unconditional_strong_approx_En}, we obtain -an unconditional strong -approximation for $E_n$. The resulting rate of approximation may not be -optimal, due to the Yurinskii coupling, but to the best of our knowledge it is -the first in the literature for the process $E_n$, and hence for $\hat{f}_W$ -and its associated $t$-process in the context of dyadic data. The approximation -rate is sufficiently fast to allow for optimal bandwidth choices; see Section -\ref{sec:kernel_implementation} for more details. Strong approximation results -for -local empirical processes (e.g.\ \citealp{gine2010confidence}) are not -applicable here because the summands in the non-negligible $E_n$ are not -(conditionally) i.i.d. Likewise, neither standard empirical process and -U-process theory \citep{van1996weak,gine2021mathematical} nor the recent -results in \citet{davezies2021exchangeable} are applicable to the non-Donsker -process $E_n$. - -The previous lemmas showed that $L_n$ is $\sqrt{n}$-consistent while $E_n$ is -$\sqrt{n^2h}$-consistent (pointwise in $w$), showcasing the importance of -careful standardization (cf.\ Studentization in -Section~\ref{sec:kernel_implementation}) for the purpose of rate adaptivity to -the -unknown degeneracy type. In other words, a challenge in conducting uniform -inference is that the finite-dimensional distributions of the stochastic -process $L_n+E_n$, and hence those of $\hat{f}_W$ and its associated -$t$-process $T_n$, may converge at different rates at different points -$w\in\cW$. The following theorem provides an (infeasible) inference procedure -which is fully adaptive to such potential unknown degeneracy. 
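Before stating it, note that combining the decomposition
\eqref{eq:kernel_hoeffding} with
Lemmas~\ref{lem:kernel_strong_approx_Ln},
\ref{lem:kernel_conditional_strong_approx_En},
and~\ref{lem:kernel_unconditional_strong_approx_En}
gives, heuristically and up to the approximation errors quantified above,
%
\begin{align*}
    \hat f_W(w) - \E\big[\hat f_W(w)\big]
    = L_n(w) + Q_n(w) + E_n(w)
    \approx
    \frac{Z_n^L(w)}{\sqrt{n}}
    + \frac{Z_n^E(w)}{\sqrt{n^2 h}}
\end{align*}
%
uniformly over $w \in \cW$, since $Q_n$ is uniformly of order $1/n$ by
Lemma~\ref{lem:kernel_uniform_concentration}. The first term is of order
$1/\sqrt{n}$ at non-degenerate points but essentially vanishes at degenerate
points, where the second term, of order $1/\sqrt{n^2 h}$, drives the
fluctuations instead; this is why standardization by $\Sigma_n(w,w)^{1/2}$
in $T_n$ is needed to adapt to the unknown degeneracy type.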
- -\begin{theorem}[Strong approximation of $T_n$] - \label{thm:kernel_strong_approx_Tn} - - Suppose that Assumptions~\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} - hold and $f_W(w) > 0$ on $\cW$, and take any $R_n \to \infty$. Then for each - $n$ there exists a centered Gaussian process $Z_n^{T}$ such that - % - \begin{align*} - &\sup_{w \in \cW} \left| T_n(w) - Z_n^{T}(w) \right| - \lesssim_\P \! - \frac{ - n^{-1} \! \log n - + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-7/6} h^{-1/2} (\log n)^{2/3} - + h^{p\wedge\beta}} - {\Dl/\sqrt{n} + 1/\sqrt{n^2h}}, - \end{align*} - % - where $\E[Z_n^T(w)Z_n^T(w')] = \E[T_n(w)T_n(w')]$ for all $w,w' \in \cW$. - % -\end{theorem} - -The first term in the numerator corresponds to the strong approximation for -$L_n$ in Lemma~\ref{lem:kernel_strong_approx_Ln} and the error introduced by -$Q_n$. -The second and third terms correspond to the conditional and unconditional -strong approximation errors for $E_n$ in Lemmas -\ref{lem:kernel_conditional_strong_approx_En} and -\ref{lem:kernel_unconditional_strong_approx_En}. -The fourth term is from -the smoothing bias result in Theorem~\ref{thm:kernel_bias}. The denominator is -the lower bound on the standard deviation $\Sigma_n(w,w)^{1/2}$ formulated in -Lemma~\ref{lem:kernel_variance_bounds}. - -In the absence of degenerate points ($\Dl > 0$) and if $n h^{7/2}\gtrsim 1$, -Theorem~\ref{thm:kernel_strong_approx_Tn} offers a strong approximation of the -$t$-process at the rate $(\log n)/\sqrt{n}+\sqrt{n}h^{p\wedge\beta}$, which -matches the celebrated KMT approximation rate for i.i.d.\ data plus the -smoothing bias. Therefore, our novel $t$-process strong approximation can -achieve the optimal KMT rate for non-degenerate dyadic distributions provided -that $p\wedge\beta \geq 3.5$. This is achievable if a fourth-order -(boundary-adaptive) kernel is used and $f_W$ is sufficiently smooth. - -In the presence of partial or total degeneracy ($\Dl =0$), -Theorem~\ref{thm:kernel_strong_approx_Tn} provides a strong approximation for -the -$t$-process at the rate -$\sqrt{h}\log n + n^{-1/4}h^{-3/8}(\log n)^{3/8} R_n + n^{-1/6}(\log n)^{2/3} -+ n h^{1/2+p\wedge\beta}$. If, for example, $n h^{p\wedge\beta}\lesssim 1$, -then our result can achieve a strong approximation rate of $n^{-1/7}$ up to -$\log n $ terms. Theorem~\ref{thm:kernel_strong_approx_Tn} appears to be the -first in the dyadic literature which is also robust to the presence of -degenerate points in the underlying dyadic distribution. - -\subsection{Application: confidence bands} - -Theorem~\ref{thm:kernel_infeasible_ucb} constructs standardized confidence -bands for -$f_W$ which are infeasible as they depend on the unknown population variance -$\Sigma_n$. In Section~\ref{sec:kernel_implementation} we will make this -inference -procedure feasible by proposing a valid estimator of the covariance function -$\Sigma_n$ for Studentization, as well as developing bandwidth selection and -robust bias correction methods. Before presenting our result on valid -infeasible uniform confidence bands, we first impose in -Assumption~\ref{ass:kernel_rates} some extra restrictions on the bandwidth -sequence, -which depend on the degeneracy type of the dyadic distribution, to ensure the -coverage rate converges. 
- -\begin{assumption}[Rate restriction for uniform confidence bands] - \label{ass:kernel_rates} - Assume that one of the following holds: - % - \begin{enumerate}[label=(\roman*)] - - \item - \label{it:kernel_rate_non} - No degeneracy ($\Dl > 0$): - $n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$, - - \item - \label{it:kernel_rate_degen} - Partial or total degeneracy ($\Dl = 0$): - $n^{-2/3} (\log n)^{7/3} \ll h - \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. - \end{enumerate} -\end{assumption} - -We now construct the infeasible uniform confidence bands. -For $\alpha \in (0,1)$, let $q_{1-\alpha}$ be the quantile satisfying -$ \P\left(\sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) -= 1 - \alpha$. -The following result employs the anti-concentration idea due to -\citet{chernozhukov2014anti} to deduce valid standardized confidence bands, -where we approximate the quantile of the unknown finite sample distribution of -$\sup_{w\in\cW} |T_n(w)|$ by the quantile $q_{1-\alpha}$ of -$\sup_{w\in\cW}|Z_n^T(w)|$. This approach offers a better rate of convergence -than relying on extreme value theory for the distributional approximation, -hence improving the finite sample performance of the proposed confidence bands. - -\begin{theorem}[Infeasible uniform confidence bands] - \label{thm:kernel_infeasible_ucb} - - Suppose that Assumptions~\ref{ass:kernel_data},~\ref{ass:kernel_bandwidth}, - and~\ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then - % - \begin{align*} - \P\left( - f_W(w) \in - \left[ \hat f_W(w) \pm q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \, \right] - \, \textup{for all } w \in \cW - \right) - \to 1 - \alpha. - \end{align*} - % -\end{theorem} - -By Theorem~\ref{thm:kernel_uniform_consistency}, the asymptotically optimal -choice of -bandwidth for uniform convergence is -$h \asymp ((\log n)/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. -As discussed in the next section, the approximate -IMSE-optimal bandwidth is $h \asymp (1/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. -Both bandwidth choices satisfy Assumption~\ref{ass:kernel_rates} only in the -case of -no degeneracy. The degenerate cases in -Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_degen}, which require -$p \wedge \beta > 1$, exhibit behavior more similar to that of standard -nonparametric kernel-based estimation and so the aforementioned optimal -bandwidth choices will lead to a non-negligible smoothing bias in the -distributional approximation for $T_n$. Different approaches are available in -the literature to address this issue, including undersmoothing or ignoring the -bias \citep{hall2001bootstrapping}, bias correction \citep{hall1992effect}, -robust bias correction \citep{calonico2018effect, calonico2022coverage}, and -Lepskii's method -\citep{lepskii1992asymptotically,birge2001alternative}, among others. In the -next section we develop a feasible uniform inference procedure, based on robust -bias correction methods, which amounts to first selecting an optimal bandwidth -for the point estimator $\hat{f}_W$ using a $p$th-order kernel, and then -correcting the bias of the point estimator while also adjusting the -standardization (Studentization) when forming the $t$-statistic $T_n$. 
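
To make the rate restrictions in Assumption~\ref{ass:kernel_rates} concrete,
consider the case $p \wedge \beta = 2$, for which the approximate
IMSE-optimal bandwidth satisfies $h \asymp n^{-2/5}$. Then
%
\begin{align*}
    n^{-6/7} \log n \ll n^{-2/5} \ll (n \log n)^{-\frac{1}{4}}
    \qquad \text{but} \qquad
    n^{-2/5} \not\ll (n^2 \log n)^{-\frac{1}{5}}
    = n^{-2/5} (\log n)^{-\frac{1}{5}},
\end{align*}
%
so this bandwidth satisfies
Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_non} but not
Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_degen},
which is precisely the situation that the robust bias correction procedure
developed in the next section is designed to handle.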
Importantly, regardless of the specific implementation details,
Theorem~\ref{thm:kernel_infeasible_ucb} shows that any bandwidth sequence $h$
satisfying both \ref{it:kernel_rate_non} and \ref{it:kernel_rate_degen}
in Assumption~\ref{ass:kernel_rates} leads to valid uniform inference which is
robust
and adaptive to the (unknown) degeneracy type.

\section{Implementation}
\label{sec:kernel_implementation}

We address outstanding implementation details to make our main uniform
inference results feasible. In Section~\ref{sec:kernel_covariance_estimation} we
propose a covariance estimator along with a modified version which is
guaranteed to be positive semi-definite. This allows for the construction of
fully feasible confidence bands in
Section~\ref{sec:kernel_feasible_confidence_bands}.
In Section~\ref{sec:kernel_bandwidth_selection} we discuss bandwidth selection
and
formalize our procedure for robust bias correction inference.

\subsection{Covariance function estimation}
\label{sec:kernel_covariance_estimation}

We estimate the covariance function $\Sigma_n$ with a plug-in approach.
For $w, w' \in \cW$,
let $S_i(w) = \frac{1}{n-1} \big( \sum_{j = 1}^{i-1} k_h(W_{j i}, w)
+ \sum_{j = i+1}^n k_h(W_{i j}, w) \big)$
estimate $\E[k_h(W_{i j},w) \mid A_i]$, and let $\hat \Sigma_n(w,w')$ be
the associated bias-corrected plug-in estimator of $\Sigma_n(w,w')$,
whose leading term is $\frac{4}{n^2} \sum_{i=1}^n S_i(w) S_i(w')$.
Since $\hat \Sigma_n$ is not guaranteed to be positive semi-definite in
finite samples, we also consider a modified estimator $\hat \Sigma_n^+$,
defined as an approximately optimal solution of a semi-definite program
\citep[SDP,][]{laurent2005semidefinite} which selects a positive
semi-definite covariance function on $\cW \times \cW$ approximating
$\hat \Sigma_n$.

\begin{lemma}[Covariance estimation]
    \label{lem:kernel_sdp}

    Suppose that Assumptions~\ref{ass:kernel_data}
    and~\ref{ass:kernel_bandwidth} hold and $f_W(w) > 0$ on $\cW$. Then
    %
    \begin{align*}
        \sup_{w,w' \in \cW}
        \left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')}
        {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right|
        &\lesssim_\P \frac{\sqrt{\log n}}{n}.
    \end{align*}
    %
    Furthermore, the approximately optimal SDP solution
    $\hat\Sigma_n^+$ satisfies
    %
    \begin{align*}
        \sup_{w,w' \in \cW} \left|
        \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}
        {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right|
        &\lesssim_\P \frac{\sqrt{\log n}}{n}.
    \end{align*}
    %
\end{lemma}

In practice we take $w, w' \in \cW_d$ where $\cW_d$ is a finite subset of
$\cW$, typically taken to be an equally-spaced grid. This yields
finite-dimensional covariance matrices, for which the SDP can be
solved
in polynomial time in $|\cW_d|$ using a general-purpose SDP solver
\citep[e.g.\ by interior point methods,][]{laurent2005semidefinite}.
The number of points in $\cW_d$ should be taken as large as is computationally
practical in order to generate confidence bands rather than merely simultaneous
confidence intervals. It is worth noting that the complexity of solving
the SDP does not depend on the number of vertices $n$, and so
does not
influence the ability of our methodology to handle large and possibly sparse
networks.

The bias-corrected variance estimator in
\citet[Section~3.2]{matsushita2021jackknife} takes a similar form to our
estimator
$\hat\Sigma_n$ but in the parametric setting, and is therefore also not
guaranteed to be positive semi-definite in finite samples. Our approach
addresses this issue, ensuring a positive semi-definite estimator
$\hat\Sigma_n^+$ is always available.

\subsection{Feasible confidence bands}
\label{sec:kernel_feasible_confidence_bands}

Given a choice of the kernel order $p$ and a bandwidth $h$, we construct a
valid confidence band that is implementable in practice.
Define the Studentized -$t$-statistic process -% -\begin{align*} - \hat T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\hat \Sigma_n^+(w,w)}}. -\end{align*} -% -Let $\hat Z_n^T(w)$ be a process which, conditional on the data $\bW_n$, -is mean-zero and Gaussian, whose conditional covariance structure is -$\E\big[ \hat Z_n^T(w) \hat Z_n^T(w') \bigm\vert \bW_n \big] -= \frac{\hat \Sigma_n^+(w,w')} -{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$. -For $\alpha \in (0,1)$, let $\hat q_{1-\alpha}$ be the -conditional quantile satisfying -$\P\big(\sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq \hat q_{1-\alpha} -\bigm\vert \bW_n \big) = 1 - \alpha$, -which is shown to be well defined in Section~\ref{sec:kernel_app_proofs}. - -\begin{theorem}[Feasible uniform confidence bands] - \label{thm:kernel_ucb} - - Suppose that Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, - and \ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then - % - \begin{align*} - \P\left( - f_W(w) \in - \left[ \hat f_W(w) \pm \hat q_{1-\alpha} - \sqrt{\hat\Sigma_n^+(w,w)} \,\right] - \,\textup{for all } w \in \cW - \right) \to 1 - \alpha. - \end{align*} - % -\end{theorem} - -Recently, \citet{chiang2022inference} derived high-dimensional central limit -theorems over rectangles for exchangeable arrays and applied them to construct -simultaneous confidence intervals for a sequence of design points. Their -inference procedure relies on the multiplier bootstrap, and their conditions -for valid inference depend on the number of design points considered. In -contrast, Theorem~\ref{thm:kernel_ucb} constructs a feasible uniform confidence -band over the entire domain of inference $\cW$ based on our strong -approximation results for the whole $t$-statistic process and the covariance -estimator $\hat\Sigma_n^+$. The required rate condition specified in -Assumption~\ref{ass:kernel_rates} does not depend on the number of design -points. -Furthermore, our proposed inference methods are robust to potential unknown -degenerate points in the underlying dyadic data generating process. - -In practice, suprema over $\cW$ can be replaced by maxima over sufficiently -many design points in $\cW$. The conditional quantile $\hat q_{1-\alpha}$ can -be estimated by Monte Carlo simulation, resampling from the Gaussian process -defined by the law of $\hat Z_n^T \mid \bW_n$. - -The bandwidth restrictions in Theorem~\ref{thm:kernel_ucb} are the same as -those for the infeasible version given in -Theorem~\ref{thm:kernel_infeasible_ucb}, -namely those imposed in Assumption \ref{ass:kernel_rates}. This follows from -the rates -of convergence obtained in Lemma~\ref{lem:kernel_sdp}, coupled with some careful -technical work given in Section~\ref{sec:kernel_app_proofs} to handle the -potential -presence of degenerate points in $\Sigma_n$. - -\subsection{Bandwidth selection and robust bias-corrected inference} -\label{sec:kernel_bandwidth_selection} - -We give practical suggestions for selecting the bandwidth parameter $h$. -Let $\nu(w)$ be a non-negative real-valued function on $\cW$ and suppose we use -a kernel of order $p < \beta$ of the form $k_h(s,w) = K\big((s-w) / h\big)/h$. -The $\nu$-weighted asymptotic IMSE (AIMSE) is minimized by -% -\begin{align*} - h^*_{\AIMSE} - &= - \left( - \frac{p!(p-1)! 
            \Big(\int_\cW f_W(w) \nu(w) \diff{w}\Big)
            \Big(\int_\R K(w)^2 \diff{w}\Big)}
        {2 \Big(
            \int_{\cW}
            f_W^{(p)}(w)^2
            \nu(w)
            \diff{w}
            \Big)
            \Big(
            \int_\R
            w^p K(w)
            \diff{w}
            \Big)^2
        }
        \right)^{\frac{1}{2p+1}}
        \left( \frac{n(n-1)}{2} \right)^{-\frac{1}{2p+1}}.
\end{align*}
%
This is akin to the AIMSE-optimal bandwidth choice for traditional monadic
kernel density estimation with a sample size of $\frac{1}{2}n(n-1)$. The choice
$h^*_{\AIMSE}$ is slightly undersmoothed (up to a polynomial $\log n$ factor)
relative to the uniform minimax-optimal bandwidth choice discussed in
Section~\ref{sec:kernel_point_estimation}, but it is easier to implement in
practice.

To implement the AIMSE-optimal bandwidth choice, we propose a simple
rule-of-thumb (ROT) approach based on Silverman's rule.
Suppose $p\wedge\beta=2$ and let $\hat\sigma^2$ and $\hat I$
be the sample variance and sample interquartile range respectively
of the data $\bW_n$. Then
$\hat{h}_{\ROT} = C(K) \big( \hat\sigma \wedge
\frac{\hat I}{1.349} \big) \big(\frac{n(n-1)}{2} \big)^{-1/5}$,
where we have $C(K)=2.576$ for the triangular kernel $K(w) = (1 - |w|) \vee 0$,
and $C(K)=2.435$ for the Epanechnikov kernel
$K(w) = \frac{3}{4}(1 - w^2) \vee 0$.

The AIMSE-optimal bandwidth selector $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$
and any of its feasible estimators only satisfy
Assumption~\ref{ass:kernel_rates} in
the case of no degeneracy ($\Dl>0$). Under partial or total degeneracy, such
bandwidths are not valid due to the usual leading smoothing (or
misspecification) bias of the distributional approximation. To circumvent this
problem and construct feasible uniform confidence bands for $f_W$, we employ
the following robust bias correction approach.

\begin{algorithm}[b!]
    \caption{Feasible uniform confidence bands}
    \label{alg:kernel_method}
    \setstretch{1.5}

    Choose a kernel $k_h$ of order $p \geq 2$ satisfying
    Assumption~\ref{ass:kernel_bandwidth}. \\

    Select a bandwidth $h \approx h^*_{\AIMSE}$ for $k_h$
    as in Section~\ref{sec:kernel_bandwidth_selection},
    perhaps using $h = \hat{h}_{\ROT}$. \\

    Choose another kernel $k_h'$ of order $p'>p$ satisfying
    Assumption~\ref{ass:kernel_bandwidth}. \\

    For $d \geq 1$, choose a set of $d$ distinct evaluation points $\cW_d$. \\

    For each $w \in \cW_d$, construct the density estimate $\hat f_W(w)$
    using $k'_{h}$ as in Section~\ref{sec:kernel_introduction}. \\

    For $w, w' \in \cW_d$, estimate the covariance $\hat \Sigma_n(w,w')$
    using $k'_{h}$ as in Section~\ref{sec:kernel_covariance_estimation}. \\

    Construct the positive semi-definite
    covariance estimate $\hat \Sigma_n^+$
    as in Section~\ref{sec:kernel_covariance_estimation}. \\

    For $B \geq 1$, let $(\hat Z_{n,r}^T: 1\leq r\leq B)$ be i.i.d.\
    copies of $\hat{Z}_n^T$
    as in Section~\ref{sec:kernel_feasible_confidence_bands}.
    \\

    For $\alpha \in (0,1)$, set
    $\hat q_{1-\alpha} = \inf_{q \in \R}
    \{ q : \# \{r: \max_{w\in\cW_d}|\hat Z_{n,r}^T(w)| \leq q \}
    \geq B(1-\alpha) \}$. \\

    Construct $ \big[\hat f_W(w) \pm
    \hat q_{1-\alpha} \hat\Sigma_n^+(w,w)^{1/2} \big]$ for each $w \in \cW_d$.
    %
\end{algorithm}

Firstly, estimate the bandwidth $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ using a
kernel of order $p$, which leads to an AIMSE-optimal point estimator
$\hat{f}_W$ in an $L^2(\nu)$ sense.
Then use this bandwidth and a kernel of -order $p' > p$ to construct the statistic $\hat T_n$ and the confidence band as -detailed in Section~\ref{sec:kernel_feasible_confidence_bands}. Importantly, -both -$\hat{f}_W$ and $\hat{\Sigma}^+_n$ are recomputed with the new higher-order -kernel. The change in centering is equivalent to a bias correction of the -original AIMSE-optimal point estimator, while the change in scale captures the -additional variability introduced by the bias correction itself. As shown -formally in \citet{calonico2018effect, calonico2022coverage} for the case of -kernel-based density -estimation with i.i.d.\ data, this approach leads to higher-order refinements -in the distributional approximation whenever additional smoothness is available -($p'\leq\beta$). In the present dyadic setting, this procedure is valid so long -as $n^{-2/3} (\log n)^{7/3} \ll n^{-\frac{2}{2p+1}} -\ll (n^2 \log n)^{-\frac{1}{2p' + 1}}$, -which is equivalent to $2 \leq p < p'$. -For concreteness, we recommend taking $p = 2$ and $p' = 4$, -and using the rule-of-thumb bandwidth choice $\hat{h}_{\ROT}$ defined above. -In particular, this approach automatically delivers a KMT-optimal -strong approximation whenever there are no degeneracies in the -underlying dyadic data generating process. -Our feasible robust bias correction method based on AIMSE-optimal dyadic -kernel density estimation for constructing uniform confidence bands -for $f_W$ is summarized in Algorithm~\ref{alg:kernel_method}. - -\section{Simulations} -\label{sec:kernel_simulations} - -We investigate the empirical finite-sample performance of the kernel density -estimator with dyadic data using simulations. The family of dyadic -distributions defined in Section~\ref{sec:kernel_degeneracy}, with its three -parameterizations, is used to generate data sets with different degeneracy -types. - -We use two different boundary bias-corrected Epanechnikov kernels of orders -$p=2$ and $p=4$ respectively, on the inference domain $\cW = [-2,2]$. We select -an optimal bandwidth for $p=2$ as recommended in -Section~\ref{sec:kernel_bandwidth_selection}, using the rule-of-thumb with -$C(K) = 2.435$. The semi-definite program in -Section~\ref{sec:kernel_covariance_estimation} is solved with the MOSEK -interior point -optimizer \citep{mosek}, ensuring positive semi-definite covariance estimates. -Gaussian vectors are resampled $B = 10\,000$ times. - -\begin{figure}[b!] 
- \centering - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/outcome_plot_total.pdf} - \caption{Total degeneracy, \\ - $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/outcome_plot_partial.pdf} - \caption{Partial degeneracy, \\ - $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} - \end{subfigure} - % - \begin{subfigure}{0.32\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/outcome_plot_none.pdf} - \caption{No degeneracy, \\ - $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} - \end{subfigure} - % - \caption[Typical outcomes for different values of the parameter $\pi$] - {Typical outcomes for three different values of the parameter $\pi$.} - % - \label{fig:kernel_results} - % -\end{figure} - -In Figure~\ref{fig:kernel_results} we plot a typical outcome for each of the -three -degeneracy types (total, partial, none), using the Epanechnikov kernel of order -$p=2$, with sample size $n=100$ (so $N=4950$ pairs of nodes) and with $d=100$ -equally-spaced evaluation points. Each plot contains the true density function -$f_W$, the dyadic kernel density estimate $\hat f_W$ and two different -approximate $95\%$ confidence bands for $f_W$. The first is the uniform -confidence band (UCB) constructed using one of our main results, -Theorem~\ref{thm:kernel_ucb}. The second is a sequence of pointwise confidence -intervals (PCI) constructed by finding a confidence interval for each -evaluation point separately. We show only $10$ pointwise confidence intervals -for clarity. In general, the PCIs are too narrow as they fail to provide -simultaneous (uniform) coverage over the evaluation points. Note that under -partial degeneracy the confidence band narrows near the degenerate point -$w = 0$. - -\begin{table}[b!] - \centering - \begin{tabular}{|c|c|c|c|c|cc|cc|} - \hline - \multirow{2}{*}{$ \pi $} - & \multirow{2}{*}{Degeneracy type} - & \multirow{2}{*}{$ \hat h_{\ROT} $} - & \multirow{2}{*}{$ p $} - & \multirow{2}{*}{RIMSE} - & \multicolumn{2}{|c|}{UCB} - & \multicolumn{2}{|c|}{PCI} \\ - \cline{6-9} - & & & & - & CR & AW - & CR & AW \\ - \hline - \multirow{2}{*}{$ \left(\frac{1}{2}, 0, \frac{1}{2}\right) $} - & \multirow{2}{*}{Total} - & \multirow{2}{*}{0.161} - & 2 & 0.00048 & 87.1\% & 0.0028 & 6.5\% & 0.0017 \\ - & & & 4 & 0.00068 & 95.2\% & 0.0042 & 9.7\% & 0.0025 \\ - \hline - \multirow{2}{*}{$ \left(\frac{1}{4}, 0, \frac{3}{4}\right) $} - & \multirow{2}{*}{Partial} - & \multirow{2}{*}{0.158} - & 2 & 0.00228 & 94.5\% & 0.0112 & 75.6\% & 0.0083 \\ - & & & 4 & 0.00234 & 94.7\% & 0.0124 & 65.3\% & 0.0087 \\ - \hline - \multirow{2}{*}{$ \left(\frac{1}{5}, \frac{1}{5}, \frac{3}{5}\right) $} - & \multirow{2}{*}{None} - & \multirow{2}{*}{0.145} - & 2 & 0.00201 & 94.2\% & 0.0106 & 73.4\% & 0.0077 \\ - & & & 4 & 0.00202 & 95.6\% & 0.0117 & 64.3\% & 0.0080 \\ - \hline - \end{tabular} - \caption[Numerical results for three values of the parameter $\pi$]{ - Numerical results for three values of the parameter $\pi$.} - \label{tab:kernel_results} -\end{table} - -Next, Table~\ref{tab:kernel_results} presents numerical results. For each -degeneracy -type (total, partial, none) and each kernel order ($p=2$, $p=4$), we run $2000$ -repeats with sample size $n=3000$ (giving $N=4\,498\,500$ pairs of nodes) and -with $d=50$ equally-spaced evaluation points. 
We record the average -rule-of-thumb bandwidth $\hat{h}_{\ROT}$ and the average root integrated mean -squared error (RIMSE). For both the uniform confidence bands (UCB) and the -pointwise confidence intervals (PCI), we report the coverage rate (CR) and the -average width (AW). -% -The lower-order kernel ($p=2$) ignores the bias, leading to good RIMSE -performance and acceptable UCB coverage under partial or no degeneracy, but -gives invalid inference under total degeneracy. In contrast, the higher-order -kernel ($p=4$) provides robust bias correction and hence improves the coverage -of the UCB in every regime, particularly under total degeneracy, at the cost of -increasing both the RIMSE and the average widths of the confidence bands. -% -As expected, the pointwise (in $w\in\cW$) confidence intervals (PCIs) severely -undercover in every regime. Thus our simulation results show that the proposed -feasible inference methods based on robust bias correction and proper -Studentization deliver valid uniform inference which is robust to unknown -degenerate points in the underlying dyadic distribution. - -\section{Counterfactual dyadic density estimation} -\label{sec:kernel_counterfactual} - -To further showcase the applicability of our main results, we develop a kernel -density estimator for dyadic counterfactual distributions. The aim of such -counterfactual analysis is to estimate the distribution of an outcome variable -had some covariates followed a distribution different from the actual one, and -it is important in causal inference and program evaluation settings -\citep{dinardo1996distribution,chernozhukov2013inference}. - -For each $r \in \{0,1\}$, let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be random -variables as defined in Assumption~\ref{ass:kernel_data} and -$\bX_n^r = (X_1^r, \ldots, X_n^r)$ be some covariates. -We assume that $(A_i^r, X_i^r)$ are independent over $1 \leq i \leq n$ -and that $\bX_n^r$ is independent of $\bV_n^r$, that -$W_{i j}^r \mid X_i^r, X_j^r$ has a conditional Lebesgue density -$f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$, -that $X_i^r$ follows a distribution function $F_X^r$ on a common support $\cX$, -and that $(\bA_n^0, \bV_n^0, \bX_n^0)$ -is independent of $(\bA_n^1, \bV_n^1, \bX_n^1)$. - -We interpret $r$ as an index for two populations, labeled $0$ and $1$. The -counterfactual density of population $1$ had it followed the -same covariate distribution as population $0$ is -% -\begin{align*} - f_W^{1 \triangleright 0}(w) - &= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right] \\ - &= \int_{\cX} \int_{\cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) - \psi(x_1) \psi(x_2) \diff F_X^{1}(x_1) \diff F_X^{1}(x_2), -\end{align*} -% -where $\psi(x) = \mathrm{d} F_X^0(x) / \mathrm{d} F_X^1(x)$ for $x \in \cX$ -is a Radon--Nikodym derivative. If $X^0_i$ and $X^1_i$ have Lebesgue densities, -it is natural to consider a parametric model of the form -$\mathrm{d} F_X^{r}(x)=f_X^r(x;\theta)\diff x$ -for some finite-dimensional parameter $\theta$. -Alternatively, if the covariates $X_n^r$ are discrete and have a positive -probability mass function $p_X^r(x)$ on a finite -support $\cX$, the object of interest becomes -$f_W^{1 \triangleright 0}(w) -= \sum_{x_1 \in \cX} \sum_{x_2 \in \cX} -f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) -p_X^{1}(x_1) p_X^{1}(x_2)$, -where $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$. 
-We consider discrete covariates for simplicity, -and hence the counterfactual dyadic kernel density estimator is -% -\begin{align*} - \hat f_W^{\,1 \triangleright 0}(w) - &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n - \hat \psi(X_i^1) \hat \psi(X_j^1) k_h(W_{i j}^1, w), -\end{align*} -% -where $\hat\psi(x) = \hat p_X^{\,0}(x) / \hat p_X^{\,1}(x)$ and -$\hat p_X^{\,r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$, -with $\I$ the indicator function. - -Section~\ref{sec:kernel_app_main} provides technical details: -we show how an asymptotic linear representation for $\hat\psi(x)$ leads to a -Hoeffding-type decomposition of $\hat f_W^{\,1 \triangleright 0}(w)$, -which is then used to establish that $\hat f_W^{\,1 \triangleright 0}$ -is uniformly consistent for $f_W^{\,1 \triangleright 0}(w)$ -and also admits a Gaussian strong approximation, with the same rates of -convergence as for the standard density estimator. Furthermore, define the -covariance function of $\hat f_W^{\,1 \triangleright 0}(w)$ as -$\Sigma_n^{1 \triangleright 0}(w,w') = \Cov\big[ - \hat f_W^{\,1 \triangleright 0}(w), -\hat f_W^{\,1 \triangleright 0}(w') \big]$, -which can be estimated as follows. First let -$\hat\kappa(X_i^0, X_i^1, x) -= \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} -- \frac{\hat p_X^0(x)}{\hat p_X^1(x)} \frac{\I\{X_i^1 = x\} - \hat -p_X^1(x)}{\hat p_X^1(x)}$ -be a plug-in estimate of the influence function for $\hat\psi(x)$ -and define the leave-one-out conditional expectation estimators -$S_i^{1 \triangleright 0}(w) -= \frac{1}{n-1} \big( \sum_{j=1}^{i-1} k_h(W_{j i}^1,w) \hat\psi(X_j^1) -+ \sum_{j=i+1}^n k_h(W_{i j}^1,w) \hat\psi(X_j^1) \big)$ -and $\tilde S_i^{1 \triangleright 0}(w) -= \frac{1}{n-1} \sum_{j=1}^n \I\{j \neq i\} -\hat\kappa(X_i^0, X_i^1, X_j^1) S_j^{1 \triangleright 0}(w)$. -Define the covariance estimator -% -\begin{align*} - \hat\Sigma_n^{1 \triangleright 0}(w,w') - &= \frac{4}{n^2} \sum_{i=1}^n - \big( - \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) - + \tilde S_i^{1 \triangleright 0}(w) - \big) - \big( - \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') - + \tilde S_i^{1 \triangleright 0}(w') - \big) \\ - &\quad- - \frac{4}{n^3(n-1)} - \sum_{i 0$ and -a kernel function $k_h$ on $\R^m \times \R^m$, -the local polynomial regression estimator of $\mu(x_1, x_2)$ is -$\hat\mu(x_1, x_2) = e_1^\T \hat\beta(x_1, x_2)$ where -$e_1$ is the first standard unit vector in $\R^q$ for -$q=\binom{2m+\gamma}{\gamma}$ and -% -\begin{align} - \nonumber - \hat{\beta}(x_1, x_2) - &= - \argmin_{\beta \in \R^q} - \sum_{i=1}^{n-1} \sum_{j=i+1}^n - \left( Y_{i j} - r(X_i-x_1, X_j-x_2)^\T \beta \right)^2 - k_h(X_i-x_1, X_j-x_2) \\ - \label{eq:kernel_locpol} - &= - \left( - \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} r_{i j}^\T - \right)^{-1} - \left( - \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} Y_{i j} - \right), -\end{align} -% -with $k_{i j} = k_h(X_i-x_1, X_j-x_2)$ and $r_{i j} = r(X_i-x_1, X_j-x_2)$. -\citet{graham2021minimax} established pointwise distribution theory -for the special case of the dyadic Nadaraya--Watson kernel regression estimator -($\gamma=0$), but no uniform analogues have yet been given. It can be shown -that the ``denominator'' matrix in \eqref{eq:kernel_locpol} converges uniformly -to its -expectation, while the U-process-like ``numerator'' matrix can be handled the -same way as we analyzed $\hat f_W(w)$ in this chapter, through a Hoeffding-type -decomposition and strong approximation methods, along with standard bias -calculations. 
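In the simplest case $\gamma = 0$ mentioned above, we have $q = 1$ and
$r \equiv 1$, so \eqref{eq:kernel_locpol} reduces to the dyadic
Nadaraya--Watson estimator
%
\begin{align*}
    \hat\mu(x_1, x_2)
    &= \frac{\sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} Y_{i j}}
    {\sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j}},
\end{align*}
%
whose numerator and denominator are exactly of the forms just discussed.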
Such distributional approximation results can be used to -construct valid uniform confidence bands for the regression function -$\mu(x_1, x_2)$, as well as to conduct hypothesis testing for parametric -specifications or shape constraints. - -As a third example, we consider applying our results to semiparametric -semi-linear regression problems. The dyadic semi-linear regression model is -$\E[Y_{i j} \mid W_{i j}, X_i, X_j] = \theta^\T W_{i j} + g(X_i, X_j)$ -where $\theta$ is the finite-dimensional parameter of interest -and $g(X_i, X_j)$ is an unknown function of the covariates $(X_i, X_j)$. -Local polynomial (or other) methods can be used to estimate $\theta$ and $g$, -where the estimator of the nonparametric component $g$ takes a similar form to -\eqref{eq:kernel_locpol}, that is, a ratio of two kernel-based estimators as in -\eqref{eq:kernel_estimator}. Consequently, the strong approximation techniques -presented in this chapter can be appropriately modified to develop valid -uniform inference procedures for $g$ and -$\E[Y_{i j} \mid W_{i j}=w, X_i=x_1, X_j=x_2]$, as well as functionals thereof. - -\section{Conclusion} -\label{sec:kernel_conclusion} - -We studied the uniform estimation and inference properties of the dyadic kernel -density estimator $\hat{f}_W$ given in \eqref{eq:kernel_estimator}, which forms -a class of U-process-like estimators indexed by the $n$-varying kernel function -$k_h$ on $\cW$. We established uniform minimax-optimal point estimation results -and uniform distributional approximations for this estimator based on novel -strong approximation strategies. We then applied these results to derive valid -and feasible uniform confidence bands for the dyadic density estimand $f_W$, -and also developed a substantive application of our theory to counterfactual -dyadic density analysis. We gave some other statistical applications of our -methodology as well as potential avenues for future research. From a technical -perspective, Appendix~\ref{app:kernel} contains several generic results -concerning strong approximation methods and maximal inequalities for empirical -processes that may be of independent interest. Implementations of this -chapter's methodology, along with replication files for the empirical results, -are provided by a Julia package available at -\github{wgunderwood/DyadicKDE.jl}. -This work is based on \citet{cattaneo2024uniform}, -and has been presented by Cattaneo at -the Columbia University Biostatistics Colloquium Seminar (2022) -and the Georgia Institute of Technology Statistics Seminar (2022), -by Feng at -the Renmin University Econometrics Seminar (2022), -the Xiamen University Symposium on Modern Statistics (2022), -the Peking University Econometrics Seminar (2023), -and the Asian Meeting of the Econometric Society -in East and Southeast Asia, Singapore (2023), -and by Underwood at the University of Illinois Statistics Seminar (2024), -the University of Michigan Statistics Seminar (2024), and the University of -Pittsburgh Statistics Seminar (2024). - -\chapter[Yurinskii's Coupling for Martingales]% -{Yurinskii's Coupling \\ for Martingales} -\label{ch:yurinskii} - -% abstract -Yurinskii's coupling is a popular theoretical tool for non-asymptotic -distributional analysis in mathematical statistics and applied probability, -offering a Gaussian strong approximation with an explicit error bound under -easily verified conditions. 
Originally stated in $\ell^2$-norm for sums of -independent random vectors, it has recently been extended both to the -$\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in -$\ell^2$-norm, under some strong conditions. We present as our main result a -Yurinskii coupling for approximate martingales in $\ell^p$-norm, under -substantially weaker conditions than those previously imposed. Our formulation -further allows for the coupling variable to follow a more general Gaussian -mixture distribution, and we provide a novel third-order coupling method which -gives tighter approximations in certain settings. We specialize our main result -to mixingales, martingales, and independent data, and derive uniform Gaussian -mixture strong approximations for martingale empirical processes. Substantive -applications of our theory to nonparametric partitioning-based and local -polynomial regression procedures are provided. - -\section{Introduction} - -Yurinskii's coupling \citep{yurinskii1978error} has proven to be an important -theoretical tool for developing non-asymptotic distributional approximations in -mathematical statistics and applied probability. For a sum $S$ of $n$ -independent zero-mean $d$-dimensional random vectors, this coupling technique -constructs (on a suitably enlarged probability space) a zero-mean -$d$-dimensional Gaussian vector $T$ with the same covariance matrix as $S$ and -which is close to $S$ in probability, bounding the discrepancy $\|S-T\|$ as a -function of $n$, $d$, the choice of the norm, and some features of the -underlying distribution. See, for example, \citet[Chapter 10]{pollard2002user} -for a textbook introduction. - -When compared to other coupling approaches, such as the celebrated Hungarian -construction \citep{komlos1975approximation} or Zaitsev's coupling -\citep{zaitsev1987estimates,zaitsev1987gaussian}, Yurinskii's approach stands -out for its simplicity, robustness, and wider applicability, while also -offering tighter couplings in some applications (see below for more discussion -and examples). These features have led many scholars to use Yurinskii's -coupling to study the distributional features of high-dimensional statistical -procedures in a variety of settings, often with the end goal of developing -uncertainty quantification or hypothesis testing methods. For example, in -recent years, Yurinskii's coupling has been used to construct Gaussian -approximations for the suprema of empirical processes -\citep{chernozhukov2014gaussian}; to establish distribution theory for -non-Donsker stochastic $t$-processes generated in nonparametric series -regression \citep{belloni2015some}; to prove distributional approximations for -high-dimensional $\ell^p$-norms \citep{biau2015high}; to develop distribution -theory for vector-valued martingales \citep{belloni2018high,li2020uniform}; to -derive a law of the iterated logarithm for stochastic gradient descent -optimization methods \citep{anastasiou2019normal}; to establish uniform -distributional results for nonparametric high-dimensional quantile processes -\citep{belloni2019conditional}; to develop distribution theory for non-Donsker -stochastic $t$-processes generated in partitioning-based series regression -\citep{cattaneo2020large}; to deduce Bernstein--von Mises theorems in -high-dimensional settings \citep{ray2021bernstein}; and to develop distribution -theory for non-Donsker U-processes based on dyadic network data -\citep{cattaneo2024uniform}. 
There are also many other early applications of -Yurinskii's coupling: \citet{dudley1983invariance} and \citet{dehling1983limit} -establish invariance principles for Banach space-valued random variables, and -\citet{lecam1988} and \citet{sheehy1992uniform} obtain uniform Donsker results -for empirical processes, to name just a few. - -This chapter presents a new Yurinskii coupling which encompasses and improves -upon all of the results previously available in the literature, offering four -new features: -% -\begin{enumerate}[label=(\roman*),leftmargin=*] - \item - \label{it:yurinskii_contribution_approximate_martingale} - It applies to vector-valued \textit{approximate martingale} data. - \item - \label{it:yurinskii_contribution_gaussian_mixture} - It allows for a \textit{Gaussian mixture} coupling distribution. - \item - \label{it:yurinskii_contribution_degeneracy} - It imposes \textit{no restrictions on degeneracy} of the - data covariance matrix. - \item - \label{it:yurinskii_contribution_third_order} - It establishes a \textit{third-order} coupling to - improve the approximation in certain situations. -\end{enumerate} -% - -Closest to our work are the unpublished manuscript by \citet{belloni2018high} -and the recent paper by \citet{li2020uniform}, which both investigated -distribution theory for martingale data using Yurinskii's coupling and related -methods. Specifically, \citet{li2020uniform} established a Gaussian -$\ell^2$-norm Yurinskii coupling for mixingales and martingales under the -assumption that the covariance structure has a minimum eigenvalue bounded away -from zero. As formally demonstrated in this chapter -(Section~\ref{sec:yurinskii_kde}), -such eigenvalue assumptions can be prohibitively strong in practically relevant -applications. In contrast, our Yurinskii coupling does not impose any -restrictions on covariance degeneracy -\ref{it:yurinskii_contribution_degeneracy}, in -addition to offering several other new features not present in -\citet{li2020uniform}, including -\ref{it:yurinskii_contribution_approximate_martingale}, -\ref{it:yurinskii_contribution_gaussian_mixture}, -\ref{it:yurinskii_contribution_third_order}, and -applicability to general $\ell^p$-norms. In addition, we correct a slight -technical inaccuracy in their proof relating to the derivation of bounds in -probability (Remark \ref{rem:yurinskii_coupling_bounds_probability}). -\citet{belloni2018high} did not establish a Yurinskii coupling for martingales, -but rather a central limit theorem for smooth functions of high-dimensional -martingales using the celebrated second-order Lindeberg method -\citep[see][and references therein]{chatterjee2006generalization}, explicitly -accounting for covariance degeneracy. As a consequence, their result could be -leveraged to deduce a Yurinskii coupling for martingales with additional, -non-trivial technical work (see Section~\ref{sec:yurinskii_app_proofs} -in Appendix~\ref{app:yurinskii} for details). -Nevertheless, a Yurinskii coupling derived from -\citet{belloni2018high} would not feature -\ref{it:yurinskii_contribution_approximate_martingale}, -\ref{it:yurinskii_contribution_gaussian_mixture}, -\ref{it:yurinskii_contribution_third_order}, or -general $\ell^p$-norms, as our results do. We discuss further the connections -between our work and the related literature in the upcoming sections, both when -introducing our main theoretical results and when presenting the examples and -statistical applications. 
- -The most general coupling result of this chapter -(Theorem~\ref{thm:yurinskii_sa_dependent}) is presented in -Section~\ref{sec:yurinskii_main_results}, where we also specialize it to a -slightly -weaker yet more user-friendly formulation -(Proposition~\ref{pro:yurinskii_sa_simplified}). Our Yurinskii coupling for -approximate -martingales is a strict generalization of all previous Yurinskii couplings -available in the literature, offering a Gaussian mixture strong approximation -for approximate martingale vectors in $\ell^p$-norm, with an improved rate of -approximation when the third moments of the data are negligible, and with no -assumptions on the spectrum of the data covariance matrix. A key technical -innovation underlying the proof of Theorem~\ref{thm:yurinskii_sa_dependent} is -that we -explicitly account for the possibility that the minimum eigenvalue of the -variance may be zero, or its lower bound may be unknown, with the argument -proceeding using a carefully tailored regularization. Establishing a coupling -to a Gaussian mixture distribution is achieved by an appropriate conditioning -argument, leveraging a conditional version of Strassen's theorem established by -\citet{chen2020jackknife}, along with some related technical work detailed in -Section~\ref{sec:yurinskii_app_proofs}. -A third-order coupling is obtained via -a modification of a standard smoothing technique for Borel sets from classical -versions of Yurinskii's coupling, enabling improved approximation errors -whenever third moments are negligible. - -In Proposition~\ref{pro:yurinskii_sa_simplified}, we explicitly tune the -parameters of -the aforementioned regularization to obtain a simpler, parameter-free version -of Yurinskii's coupling for approximate martingales, again offering Gaussian -mixture coupling distributions and an improved third-order approximation error. -This specialization of our main result takes an agnostic approach to potential -singularities in the data covariance matrix and, as such, may be improved in -specific applications where additional knowledge of the covariance structure is -available. Section~\ref{sec:yurinskii_main_results} also presents some further -refinements when additional structure is imposed, deriving Yurinskii couplings -for mixingales, martingales, and independent data as -Corollaries~\ref{cor:yurinskii_sa_mixingale}, -\ref{cor:yurinskii_sa_martingale}, and -\ref{cor:yurinskii_sa_indep}, respectively. We take the opportunity to discuss -and correct -in Remark~\ref{rem:yurinskii_coupling_bounds_probability} a technical issue -which is -often neglected \citep{pollard2002user, li2020uniform} when using Yurinskii's -coupling to derive bounds in probability. Section~\ref{sec:yurinskii_factor} -presents a -stylized example portraying the relevance of our main technical results in the -context of canonical factor models, illustrating the importance of each of our -new Yurinskii coupling features -\ref{it:yurinskii_contribution_approximate_martingale}--% -\ref{it:yurinskii_contribution_third_order}. - -Section~\ref{sec:yurinskii_emp_proc} considers a substantive application of our -main -results: strong approximation of martingale empirical processes. 
We begin with -the motivating example of canonical kernel density estimation, demonstrating -how Yurinskii's coupling can be applied, and showing in -Lemma~\ref{lem:yurinskii_kde_eigenvalue} why it is essential that we do not -place any -conditions on the minimum eigenvalue of the variance matrix -\ref{it:yurinskii_contribution_degeneracy}. -We then present a general-purpose strong -approximation for martingale empirical processes in -Proposition~\ref{pro:yurinskii_emp_proc}, combining classical results in the -empirical -process literature \citep{van1996weak} with our -Corollary~\ref{cor:yurinskii_sa_martingale}. This statement appears to be the -first of -its kind for martingale data, and when specialized to independent -(and not necessarily identically distributed) data, it is -shown to be superior to the best known comparable strong approximation result -available in the literature \citep{berthet2006revisiting}. Our improvement -comes from using Yurinskii's coupling for the $\ell^\infty$-norm, where -\citet{berthet2006revisiting} apply Zaitsev's coupling -\citep{zaitsev1987estimates, zaitsev1987gaussian} with the larger -$\ell^2$-norm. - -Section~\ref{sec:yurinskii_nonparametric} further illustrates the applicability -of our -results through two examples in nonparametric regression estimation. Firstly, -we deduce a strong approximation for partitioning-based least squares series -estimators with time series data, applying -Corollary~\ref{cor:yurinskii_sa_martingale} -directly and additionally imposing only a mild mixing condition on the -regressors. We show that our Yurinskii coupling for martingale vectors delivers -the same distributional approximation rate as the best known result for -independent data, and discuss how this can be leveraged to yield a feasible -statistical inference procedure. We also show that if the residuals have -vanishing conditional third moment, an improved rate of Gaussian approximation -can be established. Secondly, we deduce a strong approximation for local -polynomial estimators with time series data, -using our result on martingale empirical processes -(Proposition~\ref{pro:yurinskii_emp_proc}) and again imposing a mixing -assumption. -Appealing to empirical process theory is essential here as, in contrast with -series estimators, local polynomials do not possess certain additive -separability properties. The bandwidth restrictions we require are relatively -mild, and, as far as we know, they have not been improved upon even with -independent data. - -Section \ref{sec:yurinskii_conclusion} concludes the chapter. -All proofs are collected in -Appendix~\ref{app:yurinskii}, which also includes other technical lemmas -of potential independent interest, alongside some further results on -applications of our theory to deriving high-dimensional central limit theorems -for martingales in Section~\ref{sec:yurinskii_app_high_dim_clt}. - -\subsection{Notation} - -We write $\|x\|_p$ for $p\in[1,\infty]$ to denote the $\ell^p$-norm if $x$ is a -(possibly random) vector or the induced operator $\ell^p$--$\ell^p$-norm if $x$ -is a matrix. For $X$ a real-valued random variable and an Orlicz function -$\psi$, we use $\vvvert X \vvvert_\psi$ to denote the Orlicz $\psi$-norm -\citep[Section~2.2]{van1996weak} and $\vvvert X \vvvert_p$ -for the $L^p(\P)$-norm where -$p\in [1,\infty]$. For a matrix $M$, we write $\|M\|_{\max}$ for the -maximum absolute entry and $\|M\|_\rF$ for the Frobenius norm. 
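In particular, for the induced matrix norms used below, $\|M\|_1$ is the
largest absolute column sum, $\|M\|_\infty$ is the largest absolute row sum,
and $\|M\|_2$ is the spectral norm, which satisfies
$\|M\|_2 \leq \|M\|_\rF$.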
We denote -positive semi-definiteness by $M \succeq 0$ and write $I_d$ for the $d \times -d$ identity matrix. - -For scalar sequences $x_n$ and $y_n$, we write $x_n \lesssim y_n$ if there -exists a positive constant $C$ such that $|x_n| \leq C |y_n|$ for sufficiently -large $n$. We write $x_n \asymp y_n$ to indicate both $x_n \lesssim y_n$ and -$y_n \lesssim x_n$. Similarly, for random variables $X_n$ and $Y_n$, we write -$X_n \lesssim_\P Y_n$ if for every $\varepsilon > 0$ there exists a positive -constant $C$ such that $\P(|X_n| \leq C |Y_n|) \leq \varepsilon$, and write -$X_n \to_\P X$ for limits in probability. For real numbers $a$ and $b$ we use -$a \vee b = \max\{a,b\}$. We write $\kappa \in \N^d$ for a multi-index, where -$d \in \N = \{0, 1, 2, \ldots\}$, and define $|\kappa| = \sum_{j=1}^d \kappa_j$ -and $x^\kappa = \prod_{j=1}^d x_j^{\kappa_j}$ for $x \in \R^d$, -and $\kappa! = \prod_{j=1}^{d} \kappa_j !$. - -Since our results concern couplings, some statements must be made on a new or -enlarged probability space. We omit the details of this for clarity of -notation, but technicalities are handled by the Vorob'ev--Berkes--Philipp -Theorem~\citep[Theorem~1.1.10]{dudley1999uniform}. - -\section{Main results} -\label{sec:yurinskii_main_results} - -We begin with our most general result: an $\ell^p$-norm Yurinskii coupling of a -sum of vector-valued approximate martingale differences to a Gaussian -mixture-distributed random vector. The general result is presented in -Theorem~\ref{thm:yurinskii_sa_dependent}, while -Proposition~\ref{pro:yurinskii_sa_simplified} gives -a simplified and slightly weaker version which is easier to use in -applications. We then further specialize -Proposition~\ref{pro:yurinskii_sa_simplified} to -three scenarios with successively stronger assumptions, namely mixingales, -martingales, and independent data in -Corollaries~\ref{cor:yurinskii_sa_mixingale}, -\ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} -respectively. In each case we -allow for possibly random quadratic variations (cf.\ mixing convergence), -thereby establishing a Gaussian mixture coupling in the general setting. In -Remark~\ref{rem:yurinskii_coupling_bounds_probability} we comment on and -correct an often -overlooked technicality relating to the derivation of bounds in probability -from Yurinskii's coupling. As a first illustration of the power of our -generalized $\ell^p$-norm Yurinskii coupling, we present in -Section~\ref{sec:yurinskii_factor} a simple factor model example relating to -all three of the aforementioned scenarios. - -\begin{theorem}[Strong approximation for vector-valued approximate martingales] - \label{thm:yurinskii_sa_dependent} - - Take a complete probability space with a countably generated filtration - $\cH_0, \ldots, \cH_n$ for $n \geq 1$, supporting the $\R^d$-valued - square-integrable variables $X_1, \ldots, X_n$. - Let $S = \sum_{i=1}^n X_i$ and define - % - \begin{align*} - \tilde X_i - &= \sum_{r=1}^n \big(\E[X_{r} \mid \cH_{i}] - \E[X_{r} \mid \cH_{i-1}]\big) - & &\text{and} - &U &= \sum_{i=1}^{n} \big( X_i - \E[ X_i \mid \cH_n] - + \E[ X_i \mid \cH_0 ] \big). - \end{align*} - % - Let $V_i = \Var[\tilde X_i \mid \cH_{i-1}]$ and - define $\Omega = \sum_{i=1}^n V_i - \Sigma$ - where $\Sigma$ is an almost surely positive semi-definite $\cH_0$-measurable - $d \times d$ matrix. 
Then, for each $\eta > 0$ and $p \in [1,\infty]$, - there exists, on an enlarged probability space, an $\R^d$-valued random - vector $T$ with $T \mid \cH_0 \sim \cN(0, \Sigma)$ and - % - \begin{align} - \label{eq:yurinskii_sa_dependent} - \P\big(\|S-T\|_p > 6\eta\big) - &\leq - \inf_{t>0} - \left\{ - 2 \P\big( \|Z\|_p > t \big) - + \min\left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\} \nonumber \\ - &\quad+ - \inf_{M \succeq 0} - \Big\{ 2 \P\big(\Omega \npreceq M\big) + \delta_p(M,\eta) - + \varepsilon_p(M, \eta)\Big\} - +\P\big(\|U\|_p>\eta\big), - \end{align} - % - where $Z, Z_1,\dots ,Z_n$ are i.i.d.\ standard Gaussian random variables on - $\R^d$ independent of $\cH_n$, the second infimum is taken over all positive - semi-definite $d \times d$ non-random matrices $M$, - % - \begin{align*} - \beta_{p,k} - &= - \sum_{i=1}^n \E\left[\| \tilde X_i \|^k_2 \| \tilde X_i \|_p - + \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], - &\pi_3 - &= - \sum_{i=1}^{n} - \sum_{|\kappa| = 3} - \E \Big[ \big| - \E [ \tilde X_i^\kappa \mid \cH_{i-1} ] - \big| \Big] - \end{align*} - % - for $k \in \{2, 3\}$, with $\pi_3 = \infty$ if the associated - conditional expectation does not exist, and with - % - \begin{align*} - \delta_p(M,\eta) - &= - \P\left( - \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p - \geq \eta - \right), \\ - \varepsilon_p(M, \eta) - &= - \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ - \Omega \preceq M\right). - \end{align*} -\end{theorem} - -This theorem offers four novel contributions to the literature on coupling -theory and strong approximation, as discussed in the introduction. -% approximate martingales -Firstly \ref{it:yurinskii_contribution_approximate_martingale}, it allows for -approximate -vector-valued martingales, with the variables $\tilde X_i$ forming martingale -differences with respect to $\cH_i$ by construction, and $U$ quantifying the -associated martingale approximation error. Such martingale approximation -techniques for sequences of dependent random vectors are well established and -have been used in a range of scenarios: see, for example, -\citet{wu2004martingale}, \citet{dedecker2007weak}, \citet{zhao2008martingale}, -\citet{peligrad2010conditional}, \citet{atchade2014martingale}, -\citet{cuny2014martingale}, \citet{magda2018martingale}, and references -therein. In Section~\ref{sec:yurinskii_mixingales} we demonstrate how this -approximation -can be established in practice by restricting our general theorem to the -special case of mixingales, while the upcoming example in -Section~\ref{sec:yurinskii_factor} provides an illustration in the context of -auto-regressive factor models. - -% Gaussian mixture -Secondly \ref{it:yurinskii_contribution_gaussian_mixture}, -Theorem~\ref{thm:yurinskii_sa_dependent} allows for the -resulting coupling variable $T$ -to follow a multivariate Gaussian distribution only conditionally, -and thus we offer a useful analog of mixing convergence in the context -of strong approximation. -To be more precise, the random matrix $\sum_{i=1}^{n} V_i$ -is the quadratic variation of the constructed martingale -$\sum_{i=1}^n \tilde X_i$, and we approximate it using the $\cH_0$-measurable -random matrix $\Sigma$. This yields the coupling variable -$T \mid \cH_0 \sim \cN(0, \Sigma)$, which can alternatively be written as -$T=\Sigma^{1/2} Z$ with $Z \sim \cN(0,I_d)$ independent of $\cH_0$. 
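Indeed, since $\Sigma^{1/2}$ is $\cH_0$-measurable and $Z$ is independent of
$\cH_0$, conditionally on $\cH_0$ the vector $\Sigma^{1/2} Z$ is a linear
transformation of a standard Gaussian vector, with
%
\begin{align*}
    \E\big[ \Sigma^{1/2} Z \mid \cH_0 \big] = 0
    \qquad \text{and} \qquad
    \Var\big[ \Sigma^{1/2} Z \mid \cH_0 \big]
    = \Sigma^{1/2} \Var[Z] \, \Sigma^{1/2}
    = \Sigma,
\end{align*}
%
so the two formulations agree.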
-The errors in this quadratic variation -approximation are accounted for by the terms -$\P(\Omega \npreceq M)$, $\delta_p(M, \eta)$, and $\varepsilon_p(M, \eta)$, -utilizing a regularization argument through the free matrix parameter $M$. -If a non-random $\Sigma$ is used, then $T$ is unconditionally Gaussian, -and one can take $\cH_0$ to be the trivial $\sigma$-algebra. -As demonstrated in our proof, our approach to establishing a -mixing approximation is different from naively taking an unconditional version -of Yurinskii's coupling and applying -it conditionally on $\cH_0$, which will not deliver the same coupling as in -Theorem~\ref{thm:yurinskii_sa_dependent} for a few reasons. -To begin with, we explicitly indicate in the -conditions of Theorem~\ref{thm:yurinskii_sa_dependent} where conditioning is -required. -Next, our error of approximation is given unconditionally, -involving only marginal expectations and probabilities. -Finally, we provide a rigorous account of the construction of the -conditionally Gaussian coupling variable $T$ via a conditional version -of Strassen's theorem \citep{chen2020jackknife}. -Section~\ref{sec:yurinskii_martingales} -illustrates how a strong approximation akin to -mixing convergence can arise when the data -forms an exact martingale, and Section~\ref{sec:yurinskii_factor} gives a -simple example -relating to factor modeling in statistics and data science. - -% remove lower bound on minimum eigenvalue -As a third contribution to the literature -\ref{it:yurinskii_contribution_degeneracy}, and -of particular importance for applications, -Theorem~\ref{thm:yurinskii_sa_dependent} makes -no requirements on the minimum eigenvalue of the quadratic variation of the -approximating martingale sequence. Instead, our proof technique employs a -careful regularization scheme designed to account for any such exact or -approximate rank degeneracy in $\Sigma$. This capability is fundamental in some -applications, a fact which we illustrate in Section \ref{sec:yurinskii_kde} by -demonstrating the significant improvements in strong approximation errors -delivered by Theorem~\ref{thm:yurinskii_sa_dependent} relative to those -obtained using -prior results in the literature. - -% matching third moments -Finally \ref{it:yurinskii_contribution_third_order}, -Theorem~\ref{thm:yurinskii_sa_dependent} gives -a third-order strong approximation alongside the usual second-order -version considered in all prior literature. -More precisely, we observe that an analog of the term -$\beta_{p,2}$ is present in the -classical Yurinskii coupling and comes from a Lindeberg -telescoping sum argument, -replacing random variables by Gaussians with the same mean -and variance to match the first and second moments. -Whenever the third moments of $\tilde X_i$ are negligible -(quantified by $\pi_3$), this moment-matching argument can be extended to -third-order terms, giving a new term $\beta_{p,3}$. -In certain settings, such as when the data is symmetrically -distributed around zero, using $\beta_{p,3}$ rather than $\beta_{p,2}$ -can give smaller approximation errors in the coupling given in -\eqref{eq:yurinskii_sa_dependent}. -Such a refinement can be viewed as a strong approximation counterpart -to classical Edgeworth expansion methods. -We illustrate this phenomenon in our -upcoming applications to nonparametric inference -(Section~\ref{sec:yurinskii_nonparametric}). 
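
To see when the condition $\pi_3 = 0$ holds, suppose that each $\tilde X_i$
is symmetrically distributed about zero conditionally on $\cH_{i-1}$, in the
sense that $\tilde X_i$ and $-\tilde X_i$ have the same conditional law, and
that the relevant conditional third moments exist. Then, for every
multi-index $\kappa$ with $|\kappa| = 3$,
%
\begin{align*}
    \E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big]
    = \E\big[ (-\tilde X_i)^\kappa \mid \cH_{i-1} \big]
    = (-1)^{|\kappa|} \E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big]
    = - \E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big],
\end{align*}
%
so every such conditional moment is zero and hence $\pi_3 = 0$.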
- -\subsection{User-friendly formulation of the main result}% - -The result in Theorem~\ref{thm:yurinskii_sa_dependent} is given in a somewhat -implicit -manner, involving infima over the free parameters $t > 0$ and $M \succeq 0$, -and it is not clear how to compute these in general. In the upcoming -Proposition~\ref{pro:yurinskii_sa_simplified}, we set $M = \nu^2 I_d$ and -approximately -optimize over $t > 0$ and $\nu > 0$, resulting in a simplified and slightly -weaker version of our main general result. In specific applications, where -there is additional knowledge of the quadratic variation structure, other -choices of regularization schemes may be more appropriate. Nonetheless, the -choice $M = \nu^2 I_d$ leads to arguably the principal result of our work, -due to its simplicity and utility in statistical applications. For convenience, -define the functions $\phi_p : \N \to \R$ for $p \in [0, \infty]$, -% -\begin{align*} - \phi_p(d) = - \begin{cases} - \sqrt{pd^{2/p} } & \text{ if } p \in [1,\infty), \\ - \sqrt{2\log 2d} & \text{ if } p =\infty, - \end{cases} -\end{align*} -% -which are related to tail probabilities -of the $\ell^p$-norm of a standard Gaussian. - -\begin{proposition}[Simplified strong approximation - for approximate martingales]% - \label{pro:yurinskii_sa_simplified} - - Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}. - For each $\eta > 0$ and $p \in [1,\infty]$, - there exists a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - +\P\left(\|U\|_p>\frac{\eta}{6}\right). - \end{align*} - % - If further $\pi_3 = 0$ then - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} - \right)^{1/4} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - +\P\left(\|U\|_p>\frac{\eta}{6}\right). - \end{align*} - % -\end{proposition} - -Proposition~\ref{pro:yurinskii_sa_simplified} makes clear the potential benefit -of a -third-order coupling when $\pi_3 = 0$, as in this case the bound features -$\beta_{p,3}^{1/4}$ rather than $\beta_{p,2}^{1/3}$. If $\pi_3$ is small but -non-zero, an analogous result can easily be derived by adjusting the optimal -choices of $t$ and $\nu$, but we omit this for clarity of notation. In -applications (see Section~\ref{sec:yurinskii_series}), this reduction of the -exponent can -provide a significant improvement in terms of the dependence of the bound on -the sample size $n$, the dimension $d$, and other problem-specific quantities. -When using our results for strong approximation, it is usual to set -$p = \infty$ to bound the maximum discrepancy over the entries of a vector (to -construct uniform confidence sets, for example). In this setting, we have that -$\phi_\infty(d) = \sqrt{2 \log 2d}$ has a sub-Gaussian slow-growing dependence -on the dimension. The remaining term depends on $\E[\|\Omega\|_2]$ and requires -that the matrix $\Sigma$ be a good approximation of $\sum_{i=1}^{n} V_i$, while -remaining $\cH_0$-measurable. In some applications (such as factor modeling; -see Section~\ref{sec:yurinskii_factor}), it can be shown that the quadratic -variation -$\sum_{i=1}^n V_i$ remains random and $\cH_0$-measurable even in large samples, -giving a natural choice for $\Sigma$. 
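
To make the role of $\phi_p$ explicit, recall the standard Gaussian moment
bounds: if $Z \sim \cN(0, I_d)$ then
%
\begin{align*}
    \E\big[ \|Z\|_p \big]
    &\leq \big( d \, \E\big[ |Z_1|^p \big] \big)^{1/p}
    \leq \sqrt{p} \, d^{1/p}
    = \phi_p(d)
    &&\text{for } p \in [1, \infty), \\
    \E\big[ \|Z\|_\infty \big]
    &\leq \sqrt{2 \log 2 d}
    = \phi_\infty(d),
\end{align*}
%
so taking $p = \infty$ incurs only a logarithmic dependence on the dimension
$d$ in the terms involving $\phi_p(d)$.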
- -In the next few sections, we continue to refine -Proposition~\ref{pro:yurinskii_sa_simplified}, presenting a sequence of results -with -increasingly strict assumptions on the dependence structure of the data $X_i$. -These allow us to demonstrate the broad applicability of our main results, -providing more explicit bounds in settings which are likely to be of special -interest. In particular, we consider mixingales, martingales, and independent -data, comparing our derived results with those in the existing literature. - -\subsection{Mixingales} -\label{sec:yurinskii_mixingales} - -In our first refinement, we provide a natural method for bounding the -martingale approximation error term $U$. Suppose that $X_i$ form an -$\ell^p$-mixingale in $L^1(\P)$ in the sense that there exist non-negative -$c_1, \ldots, c_n$ and $\zeta_0, \ldots, \zeta_n$ such that for all -$1 \leq i \leq n$ and $0 \leq r \leq i$, -% -\begin{align} - \label{eq:yurinskii_mixingale_1} - \E \left[ \left\| - \E \left[ X_i \mid \cH_{i-r} \right] - \right\|_p \right] - &\leq - c_i \zeta_r, -\end{align} -% -and for all $1 \leq i \leq n$ and $0 \leq r \leq n-i$, -% -\begin{align} - \label{eq:yurinskii_mixingale_2} - \E \left[ \big\| - X_i - \E \big[ X_i \mid \cH_{i+r} \big] - \big\|_p \right] - &\leq - c_i \zeta_{r+1}. -\end{align} -% -These conditions are satisfied, for example, if $X_i$ are integrable strongly -$\alpha$-mixing random variables \citep{mcleish1975invariance}, or if $X_i$ are -generated by an auto-regressive or auto-regressive moving average process (see -Section~\ref{sec:yurinskii_factor}), among many other possibilities -\citep{bradley2005basic}. Then, in the notation of -Theorem~\ref{thm:yurinskii_sa_dependent}, we have by Markov's inequality that -% -\begin{align*} - \P \left( \|U\|_p > \frac{\eta}{6} \right) - &\leq - \frac{6}{\eta} - \sum_{i=1}^{n} - \E \left[ - \big\| - X_i - \E \left[ X_i \mid \cH_n \right] - \big\|_p - + \big\| - \E \left[ X_i \mid \cH_0 \right] - \big\|_p - \right] - \leq \frac{\zeta}{\eta}, -\end{align*} -% -with $\zeta = 6 \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. -Combining Proposition~\ref{pro:yurinskii_sa_simplified} with this -martingale error bound yields the following result for mixingales. -% -\begin{corollary}[Strong approximation for vector-valued mixingales]% - \label{cor:yurinskii_sa_mixingale} - - Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, - and suppose - the mixingale conditions \eqref{eq:yurinskii_mixingale_1} and - \eqref{eq:yurinskii_mixingale_2} hold. For each $\eta > 0$ and - $p \in [1,\infty]$ there - is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - + \frac{\zeta}{\eta}. - \end{align*} - % - If further $\pi_3 = 0$ then - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} - \right)^{1/4} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - + \frac{\zeta}{\eta}. - \end{align*} - % -\end{corollary} - -The closest antecedent to Corollary~\ref{cor:yurinskii_sa_mixingale} is found in -\citet[Theorem~4]{li2020uniform}, who also considered Yurinskii's coupling for -mixingales. 
Our result improves on this work in the following manner: it -removes any requirements on the minimum eigenvalue of the quadratic variation -of the mixingale sequence; it allows for general $\ell^p$-norms with -$p\in[1,\infty]$; it establishes a coupling to a multivariate Gaussian -mixture distribution in general; and it permits third-order couplings -(when $\pi_3=0$). These improvements have important practical implications as -demonstrated in Sections \ref{sec:yurinskii_factor} and -\ref{sec:yurinskii_nonparametric}, -where significantly better coupling approximation -errors are demonstrated for a variety of statistical applications. On the -technical side, our result is rigorously established using a conditional -version of Strassen's theorem \citep{chen2020jackknife}, a carefully crafted -regularization argument, and a third-order Lindeberg method -\citep[see][and references therein, for more discussion on the -standard second-order Lindeberg method]{chatterjee2006generalization}. -Furthermore, as explained in -Remark~\ref{rem:yurinskii_coupling_bounds_probability}, we -clarify a technical issue in \citet{li2020uniform} surrounding the derivation -of valid probability bounds for $\|S-T\|_p$. - -Corollary~\ref{cor:yurinskii_sa_mixingale} focused on mixingales for -simplicity, but, as -previously discussed, any method for constructing a martingale approximation -$\tilde X_i$ and bounding the resulting error $U$ could be used instead in -Proposition~\ref{pro:yurinskii_sa_simplified} to derive a similar result. - -\subsection{Martingales} -\label{sec:yurinskii_martingales} - -For our second refinement, suppose that -$X_i$ form martingale differences with respect to $\cH_i$. -In this case, $\E[X_i \mid \cH_n] = X_i$ and $\E[X_i \mid \cH_0] = 0$, -so $U = 0$, and the martingale approximation error term vanishes. -Applying Proposition~\ref{pro:yurinskii_sa_simplified} in this setting -directly yields the following result. -% -\begin{corollary}[Strong approximation for vector-valued martingales]% - \label{cor:yurinskii_sa_martingale} - - With the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, - suppose that - $X_i$ is $\cH_i$-measurable satisfying $\E[X_i \mid \cH_{i-1}] = 0$ for - $1 \leq i \leq n$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, there is - a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with - % - \begin{align} - \label{eq:yurinskii_sa_martingale_order_2} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3}. - \end{align} - % - If further $\pi_3 = 0$ then - % - \begin{align} - \label{eq:yurinskii_sa_martingale_order_3} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} - \right)^{1/4} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3}. - \end{align} - % -\end{corollary} - -The closest antecedents to Corollary~\ref{cor:yurinskii_sa_martingale} are -\citet{belloni2018high} and \citet{li2020uniform}, who also implicitly or -explicitly considered Yurinskii's coupling for martingales. 
More specifically, -\citet[Theorem~1]{li2020uniform} established an explicit -$\ell^2$-norm Yurinskii coupling -for martingales under a strong assumption on the minimum eigenvalue of the -martingale quadratic variation, while \citet[Theorem~2.1]{belloni2018high} -established a central limit theorem for vector-valued martingale sequences -employing the standard second-order Lindeberg method, implying that their proof -could be adapted to deduce a Yurinskii coupling for martingales with the help -of a conditional version of Strassen's theorem \citep{chen2020jackknife} and -some additional nontrivial technical work. - -Corollary~\ref{cor:yurinskii_sa_martingale} improves over this prior work as -follows. -With respect to \citet{li2020uniform}, our result establishes an $\ell^p$-norm -Gaussian mixture Yurinskii coupling for martingales without any requirements on -the minimum eigenvalue of the martingale quadratic variation, and permits a -third-order coupling if $\pi_3=0$. The first probability bound -\eqref{eq:yurinskii_sa_martingale_order_2} in -Corollary~\ref{cor:yurinskii_sa_martingale} gives the -same rate of strong approximation as that in Theorem~1 of \citet{li2020uniform} -when $p=2$, with non-random $\Sigma$, and when the eigenvalues of a normalized -version of $\Sigma$ are bounded away from zero. In -Section~\ref{sec:yurinskii_kde} we -demonstrate the crucial importance of removing this eigenvalue lower bound -restriction in applications involving nonparametric kernel estimators, while in -Section~\ref{sec:yurinskii_series} we demonstrate how the availability of a -third-order -coupling \eqref{eq:yurinskii_sa_martingale_order_3} can give improved -approximation rates -in applications involving nonparametric series estimators with conditionally -symmetrically distributed residual errors. Finally, our technical work improves -on \citet{li2020uniform} in two respects: -% -\begin{inlineroman} - \item - we employ a conditional version - of Strassen's theorem (see Lemma~\ref{lem:yurinskii_app_strassen} - in the appendix) - to appropriately handle the conditioning arguments; and - \item - we deduce valid - probability bounds for $\|S-T\|_p$, as the following - Remark~\ref{rem:yurinskii_coupling_bounds_probability} makes clear. -\end{inlineroman} - -\begin{remark}[Yurinskii's coupling and bounds in probability] - \label{rem:yurinskii_coupling_bounds_probability} - Given a sequence of random vectors $S_n$, Yurinskii's method provides a - coupling in the following form: for each $n$ and any $\eta > 0$, there exists - a random vector $T_n$ with $\P\big(\|S_n - T_n\| > \eta\big) < r_n(\eta)$, - where $r_n(\eta)$ is the approximation error. Crucially, each coupling - variable $T_n$ is a function of the desired approximation level $\eta$ and, - as such, deducing bounds in probability on $\|S_n - T_n\|$ requires some - extra care. One option is to select a sequence $R_n \to \infty$ and note that - $\P\big(\|S_n - T_n\| > r_n^{-1}(1 / R_n)\big) < 1 / R_n \to 0$ and hence - $\|S_n - T_n\| \lesssim_\P r_n^{-1}(1 / R_n)$. In this case, $T_n$ depends on - the choice of $R_n$, which can in turn typically be chosen to diverge slowly - enough to cause no issues in applications. -\end{remark} - -Technicalities akin to those outlined in -Remark~\ref{rem:yurinskii_coupling_bounds_probability} have been both addressed -and -neglected alike in the prior literature. 
\citet[Chapter 10.4, Example -16]{pollard2002user} apparently misses this subtlety, providing an -inaccurate bound in probability based on the Yurinskii coupling. -\citet{li2020uniform} seem to make the same mistake in the proof of their -Lemma~A2, which invalidates the conclusion of their Theorem~1. In contrast, -\citet{belloni2015some} and \citet{belloni2019conditional} directly provide -bounds in $o_\P$ instead of $O_\P$, circumventing these issues in a manner -similar to our approach involving a diverging sequence $R_n$. - -To see how this phenomenon applies to our main results, observe that the -second-order martingale coupling given as -\eqref{eq:yurinskii_sa_martingale_order_2} in -Corollary~\ref{cor:yurinskii_sa_martingale} implies that for any -$R_n \to \infty$, -% -\begin{align*} - \|S - T\|_p - \lesssim_\P - \beta_{p,2}^{1/3} - \phi_p(d)^{2/3} R_n - + \E[\|\Omega\|_2]^{1/2} - \phi_p(d) R_n. -\end{align*} -% -This bound is comparable to that obtained by \citet[Theorem~1]{li2020uniform} -with $p=2$, albeit with their formulation missing the $R_n$ correction terms. -In Section~\ref{sec:yurinskii_series} we discuss further their (amended) -result, in the -setting of nonparametric series estimation. Our approach using -$p = \infty$ obtains superior distributional approximation rates, alongside -exhibiting various other improvements such as the aforementioned third-order -coupling. - -Turning to the comparison with \citet{belloni2018high}, our -Corollary~\ref{cor:yurinskii_sa_martingale} again offers the same improvements, -with the -only exception being that the authors did account for the implications of a -possibly vanishing minimum eigenvalue. However, their results exclusively -concern high-dimensional central limit theorems for vector-valued martingales, -and therefore while their findings -could in principle enable the derivation of a result similar to our -Corollary~\ref{cor:yurinskii_sa_martingale}, this would require additional -technical work -on their behalf in multiple ways -(see Appendix~\ref{app:yurinskii}): -% -\begin{inlineroman} - \item a correct application of a conditional - version of Strassen's theorem - (Lemma~\ref{lem:yurinskii_app_strassen}); - \item the development of a third-order Borel set smoothing technique and - associated $\ell^p$-norm moment control - (Lemmas \ref{lem:yurinskii_app_smooth_approximation}, - \ref{lem:yurinskii_app_gaussian_useful}, - and \ref{lem:yurinskii_app_gaussian_pnorm}); - \item a careful truncation scheme to account for - $\Omega\npreceq0$; and - \item a valid third-order Lindeberg argument - (Lemma \ref{lem:yurinskii_app_sa_martingale}), - among others. -\end{inlineroman} - -\subsection{Independence} - -As a final refinement, suppose that $X_i$ are independent and -zero-mean conditionally on $\cH_0$, -and take $\cH_i$ to be the filtration -generated by $X_1, \ldots, X_i$ and $\cH_0$ for $1 \leq i \leq n$. -Then, taking $\Sigma = \sum_{i=1}^n V_i$ -gives $\Omega = 0$, and hence Corollary~\ref{cor:yurinskii_sa_martingale} -immediately yields the following result. -% -\begin{corollary}[Strong approximation for sums of independent vectors]% - \label{cor:yurinskii_sa_indep} - - Take the setup of Theorem~\ref{thm:yurinskii_sa_dependent}, - and let $X_i$ be independent given $\cH_0$, - with $\E[X_i \mid \cH_0] = 0$. 
- Then, for each $\eta > 0$ and $p \in [1,\infty]$, - with $\Sigma = \sum_{i=1}^n V_i$, - there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with - % - \begin{align} - \label{eq:yurinskii_sa_indep_order_2} - \P\big(\|S-T\|_p > \eta\big) - &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}. - \end{align} - % - If further $\pi_3 = 0$ then - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}. - \end{align*} - % -\end{corollary} - -Taking $\cH_0$ to be trivial, -\eqref{eq:yurinskii_sa_indep_order_2} provides an $\ell^p$-norm approximation -analogous to that presented in \citet{belloni2019conditional}. -By further -restricting to $p=2$, we recover the original Yurinskii coupling as presented -in \citet[Theorem~1]{lecam1988} and \citet[Theorem~10]{pollard2002user}. Thus, -in the independent data setting, our result improves on prior work as follows: -\begin{inlineroman} - \item - it establishes a coupling to a multivariate Gaussian mixture distribution; - and - \item - it permits a third-order coupling if $\pi_3=0$. -\end{inlineroman} - -\subsection{Stylized example: factor modeling} -\label{sec:yurinskii_factor} - -In this section, we present a simple statistical example of how our -improvements over prior coupling results can have important theoretical and -practical implications. Consider the stylized factor model -% -\begin{align*} - X_i = L f_i + \varepsilon_i, \qquad 1 \leq i \leq n, -\end{align*} -% -with random variables $L$ taking values in $\R^{d \times m}$, $f_i$ in $\R^m$, -and $\varepsilon_i$ in $\R^d$. We interpret $f_i$ as a latent factor variable -and $L$ as a random factor loading, with idiosyncratic disturbances -$\varepsilon_i$. See \citet{fan2020statistical}, and references therein, for a -textbook review of factor analysis in statistics and econometrics. - -We employ the above factor model to give a first illustration of the -applicability of our main result Theorem~\ref{thm:yurinskii_sa_dependent}, the -user-friendly Proposition~\ref{pro:yurinskii_sa_simplified}, and their -specialized -Corollaries~\ref{cor:yurinskii_sa_mixingale}--\ref{cor:yurinskii_sa_indep}. We -consider three different sets of conditions to demonstrate the applicability of -each of our corollaries for mixingales, martingales, and independent data, -respectively. We assume throughout that -$(\varepsilon_1, \ldots, \varepsilon_n)$ is zero-mean and finite variance, and -that $(\varepsilon_1, \ldots, \varepsilon_n)$ is independent -of $L$ and $(f_1, \ldots, f_n)$. Let $\cH_i$ be the $\sigma$-algebra generated -by $L$, $(f_1, \ldots, f_i)$, and $(\varepsilon_1, \ldots, \varepsilon_i)$, with -$\cH_0$ the $\sigma$-algebra generated by $L$ alone. - -\begin{itemize} - \item \emph{Independent data}. - Suppose that the factors $(f_1, \ldots, - f_n)$ are independent conditional on $L$ and satisfy - $\E [ f_i \mid L ] = 0$. - Then, since $X_i$ are independent conditional on $\cH_0$ and with - $\E [ X_i \mid \cH_0 ] = \E [ L f_i + \varepsilon_i \mid L ] = 0$, - we can apply Corollary~\ref{cor:yurinskii_sa_indep} to $\sum_{i=1}^n X_i$. - In general, we will obtain a coupling variable which has the Gaussian - mixture distribution $T \mid \cH_0 \sim \cN(0, \Sigma)$ where - $\Sigma= \sum_{i=1}^n (L\Var[f_i \mid L]L^\T +\Var[\varepsilon_i])$. - In the special case where $L$ is non-random - and $\cH_0$ is trivial, the coupling is Gaussian. 
Further, - if $f_i\mid L$ and $\varepsilon_i$ are symmetric about zero - and bounded, then $\pi_3=0$, and the coupling is improved. - - \item \emph{Martingales}. - Suppose instead that we assume only a martingale - condition on the latent factor variables so that - $\E \left[ f_i \mid L, f_1, \ldots, f_{i-1} \right] = 0$. - Then $\E [ X_i \mid \cH_{i-1} ] - = L\, \E \left[ f_i \mid \cH_{i-1} \right] = 0$ - and Corollary~\ref{cor:yurinskii_sa_martingale} is applicable to - $\sum_{i=1}^n X_i$. - The preceding comments on Gaussian mixture distributions - and third-order couplings continue to apply. - - \item \emph{Mixingales}. - Finally, assume that the factors follow the - auto-regressive model $f_i = A f_{i-1} + u_i$ where - $A \in \R^{m \times m}$ is non-random and $(u_1, \ldots, u_n)$ are - zero-mean, independent, and independent of - $(\varepsilon_1, \ldots, \varepsilon_n)$. - Then $\E \left[ f_i \mid f_0 \right] = A^i f_0$, so taking - $p \in [1, \infty]$ we see that - $\E \big[ \| \E [ f_i \mid f_0 ] \|_p \big] - = \E \big[ \| A^i f_0 \|_p \big] \leq \|A\|_p^i\,\E [ \|f_0\|_p ]$, - and that clearly $f_i - \E [ f_i \mid \cH_n ] = 0$. - Thus, whenever $\|A\|_p < 1$, the geometric sum formula implies that - we can apply the mixingale result from - Corollary~\ref{cor:yurinskii_sa_mixingale} to - $\sum_{i=1}^n X_i$. The conclusions on Gaussian mixture distributions - and third-order couplings parallel the previous cases. - % -\end{itemize} - -This simple application to factor modeling gives a preliminary illustration of -the power of our main results, encompassing settings which could not be handled -by employing Yurinskii couplings available in the existing literature. Even -with independent data, we offer new Yurinskii couplings to Gaussian mixture -distributions (due to the presence of the common random factor loading $L$), -which could be further improved whenever the factors and residuals possess -symmetric (conditional) distributions. Furthermore, our results do not impose -any restrictions on the minimum eigenvalue of $\Sigma$, thereby allowing for -more general factor structures. These improvements are maintained in the -martingale, mixingale, and weakly dependent stationary data settings. - -\section{Strong approximation for martingale empirical processes}% -\label{sec:yurinskii_emp_proc} - -In this section, we demonstrate how our main results can be applied to some more -substantive problems in statistics. Having until this point studied only -finite-dimensional (albeit potentially high-dimensional) random vectors, we now -turn our attention to infinite-dimensional stochastic processes. Specifically, -we consider empirical processes of the form -$S(f) = \sum_{i=1}^{n} f(X_i)$ for $f \in \cF$ -a problem-specific class of real-valued -functions, where each $f(X_i)$ forms a martingale difference sequence with -respect to an appropriate filtration. We construct (conditionally) Gaussian -processes $T(f)$ for which an upper bound on the uniform coupling error -$\sup_{f \in \cF} |S(f) - T(f)|$ is precisely quantified. We control the -complexity of $\cF$ using metric entropy under Orlicz norms. - -The novel strong approximation results which we present concern the entire -martingale empirical process $(S(f):f \in \cF)$, as opposed to just the scalar -supremum of the empirical process, $\sup_{f \in \cF} |S(f)|$. 
This distinction -has been carefully noted by \citet{chernozhukov2014gaussian}, who studied -Gaussian approximation of empirical process suprema in the independent data -setting and wrote (p.\ $1565$): ``A related but different problem is that of -approximating \textit{whole} empirical processes by a sequence of Gaussian -processes in the sup-norm. This problem is more difficult than -[approximating the supremum of the empirical process].'' -Indeed, the results we establish in -this section are for a strong approximation for the entire empirical process by -a sequence of Gaussian mixture processes in the supremum norm, when the data -has a martingale difference structure -(cf.\ Corollary \ref{cor:yurinskii_sa_martingale}). -Our results can be further generalized to approximate martingale -empirical processes (cf.\ Corollary \ref{cor:yurinskii_sa_mixingale}), but we -do not -consider this extension to reduce notation and the technical burden. - -\subsection{Motivating example: kernel density estimation} -\label{sec:yurinskii_kde} - -We begin with a brief study of a canonical example of an empirical process -which is non-Donsker (thus precluding the use of uniform central limit -theorems) due to the presence of a function class whose complexity increases -with the sample size: the kernel density estimator with i.i.d.\ scalar data. -We give an overview of our general strategy for -strong approximation of stochastic processes -via discretization, and show explicitly in -Lemma~\ref{lem:yurinskii_kde_eigenvalue} -how it is crucial -that we do not impose lower bounds on the eigenvalues of the discretized -covariance matrix. Detailed calculations for this section are -relegated to Appendix~\ref{app:yurinskii} for conciseness. - -Let $X_1, \ldots, X_n$ be i.i.d.\ $\Unif[0,1]$, take -$K(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$ the Gaussian kernel and let -$h \in (0,1]$ be a bandwidth. Then, for $a \in (0,1/4]$ and -$x \in \cX = [a, 1-a]$ to avoid boundary issues, the kernel density estimator -of the true density function $g(x) = 1$ is -% -\begin{align*} - \hat g(x) - &= - \frac{1}{n} - \sum_{i=1}^{n} - K_h( X_i - x), - \qquad K_h(u) = \frac{1}{h} K\left( \frac{u}{h} \right). -\end{align*} -% -Consider establishing a strong approximation for the stochastic process -$(\hat g(x)-\E [ \hat g(x) ] : x\in\cX)$ -which is, upon rescaling, non-Donsker whenever -the bandwidth decreases to zero in large samples. -To match notation with the upcoming -general result for empirical processes, set -$f_x(u) = \frac{1}{n} (K_h( u - x) - \E[K_h( X_i - x)])$ -so $S(x) \vcentcolon= S(f_x) = \hat g(x)-\E [ \hat g(x) ]$. -The next step is standard: a -mesh separates the local oscillations of the processes from -the finite-dimensional coupling. -For $\delta \in (0,1/2)$, set -$N = \left\lfloor 1 + \frac{1 - 2a}{\delta} \right\rfloor$ -and $\cX_\delta = (a + (j-1)\delta : 1 \leq j \leq N)$. -Letting $T(x)$ be the approximating stochastic -process to be constructed, consider the decomposition -% -\begin{align*} - \sup_{x \in \cX} - \big|S(x) - T(x)\big| - &\leq - \sup_{|x-x'| \leq \delta} - \big|S(x) - S(x') \big| - + \max_{x \in \cX_\delta} - |S(x) - T(x)| - + \sup_{|x-x'| \leq \delta} - \big|T(x) - T(x')\big|. -\end{align*} -% -Writing $S(\cX_\delta)$ for -$\big(S(x) : x \in \cX_\delta\big)\in \mathbb{R}^N$, -noting that this is a sum of i.i.d.\ random vectors, we apply -Corollary~\ref{cor:yurinskii_sa_indep} as -$\max_{x \in \cX_\delta} |S(x) - T(x)| -= \| S(\cX_\delta) - T(\cX_\delta) \|_\infty$. 
-We obtain that for each $\eta > 0$ there is a Gaussian vector -$T(\cX_\delta)$ with the same covariance matrix as $S(\cX_\delta)$ satisfying -% -\begin{align*} - \P\left( - \|S(\cX_\delta) - T(\cX_\delta)\|_\infty > \eta - \right) - &\leq - 31 \left( - \frac{N \log 2 N}{\eta^3 n^2 h^2} - \right)^{1/3} -\end{align*} -% -assuming that $1/h \geq \log 2 N$. -By the Vorob'ev--Berkes--Philipp theorem -\citep[Theorem~1.1.10]{dudley1999uniform}, -$T(\cX_\delta)$ extends to a Gaussian process $T(x)$ -defined for all $x \in \cX$ and with the same covariance structure -as $S(x)$. - -Next, chaining with the Bernstein--Orlicz and sub-Gaussian norms -\citep[Section~2.2]{van1996weak} shows that if -$\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$, -% -\begin{align*} - \sup_{|x-x'| \leq \delta} - \big\|S(x) - S(x') \big\|_\infty - &\lesssim_\P - \delta - \sqrt{\frac{\log n}{n h^3}} \ \quad\text{and}\quad - \sup_{|x-x'| \leq \delta} - \big\|T(x) - T(x')\big\|_\infty - \lesssim_\P - \delta - \sqrt{\frac{\log n}{n h^3}}. -\end{align*} -% -Finally, for any $R_n\to\infty$ -(see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), -the resulting bound on the coupling error is -% -\begin{align*} - \sup_{x \in \cX} - \big| S(x) - T(x) \big| - &\lesssim_\P - \left( \frac{N \log 2N}{n^2 h^2} \right)^{1/3} R_n - + \delta \sqrt{\frac{\log n}{n h^3}}, -\end{align*} -% -where the mesh size $\delta$ can then be approximately -optimized to obtain the tightest possible strong approximation. - -The discretization strategy outlined above is at the core of the proof strategy -for our upcoming Proposition~\ref{pro:yurinskii_emp_proc}. Since we will -consider -martingale empirical processes, our proof will rely on -Corollary~\ref{cor:yurinskii_sa_martingale}, which, unlike the martingale -Yurinskii -coupling established by \citet{li2020uniform}, does not require a lower bound -on the minimum eigenvalue of $\Sigma$. Using the simple kernel density example -just discussed, we now demonstrate precisely the crucial importance of removing -such eigenvalue conditions. The following -Lemma~\ref{lem:yurinskii_kde_eigenvalue} shows -that the discretized covariance matrix $\Sigma = n h\Var[S(\cX_\delta)]$ has -exponentially small eigenvalues, which in turn will negatively affect the -strong approximation bound if the \citet{li2020uniform} coupling were to be -used instead of the results in this dissertation. - -\begin{lemma}[Minimum eigenvalue of a - kernel density estimator covariance matrix]% - \label{lem:yurinskii_kde_eigenvalue} - % - The minimum eigenvalue of - $\Sigma=n h\Var[S(\cX_\delta)] \in \R^{N \times N}$ - satisfies the upper bound - % - \begin{align*} - \lambda_{\min}(\Sigma) - &\leq - 2 e^{-h^2/\delta^2} - + \frac{h}{\pi a \delta} - e^{-a^2 / h^2}. - \end{align*} -\end{lemma} -% -Figure~\ref{fig:yurinskii_min_eig} shows how the upper bound in Lemma -\ref{lem:yurinskii_kde_eigenvalue} captures the behavior of the simulated -minimum -eigenvalue of $\Sigma$. In particular, the smallest eigenvalue decays -exponentially fast in the discretization level $\delta$ and the bandwidth $h$. -As seen in the calculations above, the coupling rate depends on $\delta / h$, -while the bias will generally depend on $h$, implying that both $\delta$ and -$h$ must converge to zero to ensure valid statistical inference. In general, -this will lead to $\Sigma$ possessing extremely small eigenvalues, rendering -strong approximation approaches such as that of \citet{li2020uniform} -ineffective in such scenarios. 
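
For concreteness, the following minimal sketch indicates one way the
``Simulated'' curve in Figure~\ref{fig:yurinskii_min_eig} could be produced:
the kernel density estimator is evaluated on the grid $\cX_\delta$ over
repeated samples, the covariance of the resulting vectors is estimated
empirically, and its smallest eigenvalue is returned. The sketch assumes only
the \texttt{numpy} library; the function name, default arguments, and number
of replications are illustrative and need not match the exact simulation
design used for the figure.
%
\begin{verbatim}
import numpy as np

def min_eigenvalue_simulated(n=100, h=0.03, delta=0.05, a=0.2,
                             reps=100, seed=0):
    # Estimate the minimum eigenvalue of Sigma = n h Var[S(X_delta)]
    # by resampling the kernel density estimator on the grid X_delta.
    rng = np.random.default_rng(seed)
    grid = np.arange(a, 1 - a + 1e-12, delta)
    draws = np.empty((reps, grid.size))
    for r in range(reps):
        x = rng.uniform(0.0, 1.0, size=n)
        u = (x[:, None] - grid[None, :]) / h
        kh = np.exp(-u ** 2 / 2) / (h * np.sqrt(2 * np.pi))
        draws[r] = kh.mean(axis=0)      # hat g evaluated on the grid
    sigma = n * h * np.cov(draws, rowvar=False)
    return np.linalg.eigvalsh(sigma).min()
\end{verbatim}
%
Sweeping $\delta$ (or $h$) over a grid and plotting the returned values
against the bound of Lemma~\ref{lem:yurinskii_kde_eigenvalue} should
reproduce the qualitative exponential decay visible in
Figure~\ref{fig:yurinskii_min_eig}.
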
-% -\begin{figure}[t] - \centering - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/sim_2.pdf} - \caption{$h = 0.03$} - \end{subfigure} - \begin{subfigure}{0.49\textwidth} - \centering - \includegraphics[scale=0.64]{graphics/sim_1.pdf} - \caption{$h = 0.01$} - \end{subfigure} - \caption[Minimum eigenvalue of the kernel density covariance matrix]{ - Upper bounds on the minimum eigenvalue of the discretized covariance - matrix in kernel density estimation, - with $n=100$ and $a = 0.2$. - Simulated: the kernel density estimator is simulated, - resampling the data $100$ times - to estimate its covariance. - Computing matrix: the minimum eigenvalue of the limiting covariance - matrix $\Sigma$ is computed explicitly. - Upper bound: the bound derived in - Lemma~\ref{lem:yurinskii_kde_eigenvalue} - is shown. - } - \label{fig:yurinskii_min_eig} -\end{figure} - -The discussion in this section focuses on the strong approximation of the -centered process $\hat g(x)-\E [ \hat g(x) ]$. In practice, the goal is often -rather to approximate the feasible process $\hat g(x)- g(x)$. The difference -between these is captured by the smoothing bias $\E [ \hat g(x) ] - g(x)$, -which is straightforward to control in this case with -$\sup_{x \in \cX} \big| \E [ \hat g(x) ] - g(x) \big| -\lesssim \frac{h}{a} e^{-a^2 / (2 h^2)}$. -See Section \ref{sec:yurinskii_nonparametric} for further -comments. - -\subsection{General result for martingale empirical processes} - -We now give our general result on a strong approximation for -martingale empirical processes, obtained by applying -the first result \eqref{eq:yurinskii_sa_martingale_order_2} in -Corollary~\ref{cor:yurinskii_sa_martingale} with $p=\infty$ -to a discretization of the empirical process, -as in Section~\ref{sec:yurinskii_kde}. -We then control the increments in the stochastic processes -using chaining with Orlicz norms, -but note that other tools are available, -including generalized entropy with bracketing \citep{geer2000empirical} -and sequential symmetrization \citep{rakhlin2015sequential}. - -A class of functions is said to be \emph{pointwise measurable} -if it contains a countable subclass which is dense under -the pointwise convergence topology. -For a finite class $\cF$, write -$\cF(x) = \big(f(x) : f \in \cF\big)$. -Define the set of Orlicz functions -% -\begin{align*} - \Psi - &= - \left\{ - \psi: [0, \infty) \to [0, \infty) - \text{ convex increasing, } - \psi(0) = 0,\ - \limsup_{x,y \to \infty} \tfrac{\psi(x) \psi(y)}{\psi(C x y)} < \infty - \text{ for } C > 0 - \right\} -\end{align*} -% -and, for real-valued $Y$, the Orlicz norm -$\vvvert Y \vvvert_\psi -= \inf -\left\{ C > 0: - \E \left[ \psi(|Y|/C) \leq 1 \right] -\right\}$ -as in \citet[Section~2.2]{van1996weak}. - -\begin{proposition}[Strong approximation for martingale empirical processes]% - \label{pro:yurinskii_emp_proc} - - Let $X_i$ be random variables for $1 \leq i \leq n$ taking values in a - measurable space $\cX$, and $\cF$ be a pointwise measurable class of - functions from $\cX$ to $\R$. Let $\cH_0, \ldots, \cH_n$ be a filtration such - that each $X_i$ is $\cH_i$-measurable, with $\cH_0$ the trivial - $\sigma$-algebra, and suppose that $\E[f(X_i) \mid \cH_{i-1}] = 0$ for all - $f \in \cF$. Define $S(f) = \sum_{i=1}^n f(X_i)$ for $f\in\cF$ and let - $\Sigma: \cF \times \cF \to \R$ be an almost surely positive semi-definite - $\cH_0$-measurable random function. 
Suppose that for a non-random - metric $d$ on $\cF$, constant $L$, and $\psi \in \Psi$, - % - \begin{align}% - \label{eq:yurinskii_emp_proc_var} - \Sigma(f,f) - 2\Sigma(f,f') + \Sigma(f',f') - + \bigvvvert S(f) - S(f') \bigvvvert_\psi^2 - &\leq L^2 d(f,f')^2 \quad \text{a.s.} - \end{align} - % - Then for each $\eta > 0$ there is a process $T(f)$ - which, conditional on $\cH_0$, is zero-mean and Gaussian, - satisfying $\E\big[ T(f) T(f') \mid \cH_0 \big] = \Sigma(f,f')$ - for all $f, f' \in \cF$, and for all $t > 0$ has - % - \begin{align*} - &\P\left( - \sup_{f \in \cF} - \big| S(f) - T(f) \big| - \geq C_\psi(t + \eta) - \right) - \leq - C_\psi - \inf_{\delta > 0} - \inf_{\cF_\delta} - \Bigg\{ - \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } \\ - &\qquad\quad+ - \left(\frac{\sqrt{\log 2 |\cF_\delta|} - \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} - + \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} - + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) - \Bigg\} - \end{align*} - % - where $\cF_\delta$ is any finite $\delta$-cover of $(\cF,d)$ - and $C_\psi$ is a constant depending only on $\psi$, with - % - \begin{align*} - \beta_\delta - &= \sum_{i=1}^n - \E\left[ \|\cF_\delta(X_i)\|^2_2\|\cF_\delta(X_i)\|_\infty - + \|V_i(\cF_\delta)^{1/2}Z_i\|^2_2 - \|V_i(\cF_\delta)^{1/2}Z_i\|_\infty \right], \\ - V_i(\cF_\delta) - &= - \E\big[\cF_\delta(X_i) \cF_\delta(X_i)^\T \mid \cH_{i-1} \big], - \hspace*{27.7mm} - \Omega_\delta - = - \sum_{i=1}^n V_i(\cF_\delta) - \Sigma(\cF_\delta), \\ - J_\psi(\delta) - &= - \int_0^\delta \psi^{-1}\big( N_\varepsilon \big) - \diff{\varepsilon} - + \delta \psi^{-1} \big( N_\delta^2 \big), - \hspace*{19mm} - J_2(\delta) - = \int_0^\delta \sqrt{\log N_\varepsilon} - \diff{\varepsilon}, - \end{align*} - % - where $N_\delta = N(\delta, \cF, d)$ - is the $\delta$-covering number of $(\cF, d)$ - and $Z_i$ are i.i.d.\ $\cN\big(0, I_{|\cF_\delta|}\big)$ - independent of $\cH_n$. - If $\cF_\delta$ is a minimal $\delta$-cover - of $(\cF, d)$, then $|\cF_\delta| = N_\delta$. -\end{proposition} - -Proposition~\ref{pro:yurinskii_emp_proc} -is given in a rather general form to accommodate a range of different -settings and applications. -In particular, consider the following well-known Orlicz functions. -% -\begin{description} - - \item[Polynomial:] - $\psi(x) = x^a$ for $a \geq 2$ - has $\vvvert X \vvvert_2 \leq \vvvert X \vvvert_\psi$ and - $\sqrt{\log x} \leq \sqrt{a} \psi^{-1}(x)$. - - \item[Exponential:] - $\psi(x) = \exp(x^a) - 1$ for $a \in [1,2]$ - has $\vvvert X \vvvert_2 \leq 2\vvvert X \vvvert_\psi$ and - $\sqrt{\log x} \leq \psi^{-1}(x)$. - - \item[Bernstein:] - $\psi(x) = \exp - \Big( - \Big(\frac{\sqrt{1+2ax}-1}{a}\Big)^{2} - \Big)-1$ - for $a > 0$ has - $\vvvert X \vvvert_2 \leq (1+a)\vvvert X \vvvert_\psi$ \\ and - $\sqrt{\log x}~\leq~\psi^{-1}(x)$. - -\end{description} -% -For these Orlicz functions and when $\Sigma(f, f') = \Cov[S(f), S(f')]$ is -non-random, the terms involving $\Sigma$ in \eqref{eq:yurinskii_emp_proc_var} -can be -controlled by the Orlicz $\psi$-norm term; similarly, $J_2$ is bounded by -$J_\psi$. Further, $C_\psi$ can be replaced by a universal constant $C$ which -does not depend on the parameter $a$. See Section~2.2 in \citet{van1996weak} -for details. 
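
As a brief illustration of the entropy terms, suppose the covering numbers
satisfy a VC-type bound $N_\delta \leq c_0 \delta^{-\nu_0}$ for some constants
$c_0, \nu_0 > 0$, and take the sub-Gaussian Orlicz function
$\psi(x) = e^{x^2} - 1$, so that $\psi^{-1}(y) = \sqrt{\log(1 + y)}$.
Then for all sufficiently small $\delta$,
%
\begin{align*}
    J_\psi(\delta)
    &\leq
    \int_0^\delta
    \sqrt{\log\big(1 + c_0 \varepsilon^{-\nu_0}\big)}
    \diff{\varepsilon}
    + \delta \sqrt{\log\big(1 + c_0^2 \delta^{-2\nu_0}\big)}
    \lesssim
    \delta \sqrt{\log(1/\delta)},
\end{align*}
%
with constants depending only on $c_0$ and $\nu_0$,
and the same bound applies to $J_2(\delta)$.
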
If the conditional third moments of $f(X_i)$ given $\cH_{i-1}$ are -all zero (if $f$ and $X_i$ are appropriately symmetric, for example), then the -second inequality in Corollary~\ref{cor:yurinskii_sa_martingale} can be applied -to obtain -a tighter coupling inequality; the details of this are omitted for brevity, and -the proof would proceed in exactly the same manner. - -In general, however, Proposition~\ref{pro:yurinskii_emp_proc} allows for a -random -covariance function, yielding a coupling to a stochastic process that is -Gaussian only conditionally. Such a process can equivalently be viewed as a -mixture of Gaussian processes, writing $T=\Sigma^{1/2} Z$ with an operator -square root and where $Z$ is a Gaussian white noise on $\cF$ independent of -$\cH_0$. This extension is in contrast with much of the existing strong -approximation and empirical process literature, which tends to focus on -couplings and weak convergence results with marginally Gaussian processes -\citep{settati2009gaussian,chernozhukov2016empirical}. - -A similar approach was taken by \citet{berthet2006revisiting}, who used a -Gaussian coupling due to \citet{zaitsev1987estimates,zaitsev1987gaussian} along -with a discretization method to obtain strong approximations for empirical -processes with independent data. They handled fluctuations in the stochastic -processes with uniform $L^2$ covering numbers and bracketing numbers where we -opt instead for chaining with Orlicz norms. Our version using the martingale -Yurinskii coupling can improve upon theirs in approximation rate even for -independent data in certain circumstances. Suppose the setup of -Proposition~1 in \citet{berthet2006revisiting}; that is, $X_1, \ldots, X_n$ are -i.i.d.\ and $\sup_{\cF} \|f\|_\infty \leq M$, with the VC-type assumption -$\sup_\Q N(\varepsilon, \cF, d_\Q) \leq c_0 \varepsilon^{-\nu_0}$ where -$d_\Q(f,f')^2 = \E_\Q\big[(f-f')^2\big]$ for a measure $\Q$ on $\cX$ and -$M, c_0, \nu_0$ are constants. Using uniform $L^2$ covering numbers -rather than Orlicz chaining in our Proposition~4 gives the following. -Firstly, as $X_i$ are i.i.d., take $\Sigma(f, f') = \Cov[S(f), S(f')]$ so -$\Omega_\delta = 0$. Let $\cF_\delta$ be a minimal $\delta$-cover of -$(\cF, d_\P)$ with cardinality $N_\delta \lesssim \delta^{-\nu_0}$ where -$\delta \to 0$. It is easy to show that -$\beta_\delta \lesssim n \delta^{-\nu_0} \sqrt{\log(1/\delta)}$. -Theorem~2.2.8 and Theorem~2.14.1 in \citet{van1996weak} then give -% -\begin{align*} - \E\left[ - \sup_{d_\P(f,f') \leq \delta} - \Big( - |S(f) - S(f')| - + |T(f) - T(f')| - \Big) - \right] - &\lesssim - \sup_\Q - \int_0^\delta - \sqrt{n \log N(\varepsilon, \cF, d_\Q)} - \diff{\varepsilon} \\ - &\lesssim - \delta \sqrt{n\log(1/\delta)}, -\end{align*} -% -where we used the VC-type property to bound the entropy integral. -So by our Proposition~\ref{pro:yurinskii_emp_proc}, -for any sequence $R_n \to \infty$ -(see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), -% -\begin{align*} - \sup_{f \in \cF} - \big| S(f) - T(f) \big| - &\lesssim_\P - n^{1/3} \delta^{-\nu_0/3} - \sqrt{\log(1/\delta)} R_n - + \delta \sqrt{n\log(1/\delta)} - \lesssim_\P - n^{\frac{2+\nu_0}{6+2\nu_0}} - \sqrt{\log n} R_n, -\end{align*} -% -where we minimized over $\delta$ in the last step. 
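Specifically, ignoring logarithmic factors, the two terms
$n^{1/3} \delta^{-\nu_0 / 3}$ and $\delta \sqrt{n}$ balance at
$\delta \asymp n^{-\frac{1}{2(3 + \nu_0)}}$,
giving the exponent $\frac{2 + \nu_0}{6 + 2 \nu_0}$ displayed above.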
-\citet[Proposition~1]{berthet2006revisiting} achieved -% -\begin{align*} - \sup_{f \in \cF} - \big| S(f) - T(f) \big| - &\lesssim_\P - n^{\frac{5\nu_0}{4+10\nu_0}} - (\log n)^{\frac{4+5\nu_0}{4+10\nu_0}}, -\end{align*} -% -showing that our approach achieves a better approximation rate whenever -$\nu_0 > 4/3$. In particular, our method is superior in richer function classes -with larger VC-type dimension. For example, if $\cF$ is smoothly parameterized -by $\theta \in \Theta \subseteq \R^d$ where $\Theta$ contains an open set, then -$\nu_0 > 4/3$ corresponds to $d \geq 2$ and our rate is better as soon as the -parameter space is more than one-dimensional. The difference in approximation -rate is due to Zaitsev's coupling having better dependence on the sample size -but worse dependence on the dimension. In particular, Zaitsev's coupling is -stated only in $\ell^2$-norm and hence -\citet[Equation~5.3]{berthet2006revisiting} are compelled to use the inequality -$\|\cdot\|_\infty \leq \|\cdot\|_2$ in the coupling step, a bound which is -loose when the dimension of the vectors (here on the order of -$\delta^{-\nu_0}$) is even moderately large. We use the fact that our version -of Yurinskii's coupling applies directly to the supremum norm, giving sharper -dependence on the dimension. - -In Section~\ref{sec:yurinskii_local_poly} we apply -Proposition~\ref{pro:yurinskii_emp_proc} to -obtain strong approximations for local polynomial estimators in the -nonparametric regression setting. In contrast with the series estimators of the -upcoming Section~\ref{sec:yurinskii_series}, local polynomial estimators are -not linearly -separable and hence cannot be analyzed directly using the finite-dimensional -Corollary~\ref{cor:yurinskii_sa_martingale}. - -\section{Applications to nonparametric regression} -\label{sec:yurinskii_nonparametric} - -We illustrate the applicability of our previous strong approximation results -with two substantial and classical examples in nonparametric regression -estimation. Firstly, we present an analysis of partitioning-based series -estimators, where we can apply Corollary~\ref{cor:yurinskii_sa_martingale} -directly due to an intrinsic linear separability property. Secondly, we -consider local polynomial estimators, this time using -Proposition~\ref{pro:yurinskii_emp_proc} due to a non-linearly separable -martingale empirical process. - -\subsection{Partitioning-based series estimators} -\label{sec:yurinskii_series} - -Partitioning-based least squares methods are essential tools for estimation and -inference in nonparametric regression, encompassing splines, piecewise -polynomials, compactly supported wavelets and decision trees as special cases. -See \citet{cattaneo2020large} for further details and references throughout -this section. We illustrate the usefulness of -Corollary~\ref{cor:yurinskii_sa_martingale} -by deriving a Gaussian strong approximation for partitioning series estimators -based on multivariate martingale data. Proposition~\ref{pro:yurinskii_series} -shows how -we achieve the best known rate of strong approximation for independent data by -imposing an additional mild $\alpha$-mixing condition to control the time -series dependence of the regressors. 
- -Consider the nonparametric regression setup with martingale difference -residuals defined by $Y_i = \mu(W_i) + \varepsilon_i$ for $ 1 \leq i \leq n$ -where the regressors $W_i$ have compact connected support $\cW \subseteq \R^m$, -$\cH_i$ is the $\sigma$-algebra generated by -$(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, -$\E[\varepsilon_i \mid \cH_{i-1}] = 0$ and $\mu: \cW \to \R$ is the estimand. -Let $p(w)$ be a $k$-dimensional vector of bounded basis functions on $\cW$ -which are locally supported on a quasi-uniform partition -\citep[Assumption~2]{cattaneo2020large}. Under minimal regularity conditions, -the least-squares partitioning-based series estimator is -$\hat\mu(w) = p(w)^{\T} \hat H^{-1} \sum_{i=1}^n p(W_i) Y_i$ -with $\hat H = \sum_{i=1}^n p(W_i) p(W_i)^\T$. -The approximation power of the estimator $\hat\mu(w)$ derives from letting -$k\to\infty$ as $n\to\infty$. The assumptions made on $p(w)$ are mild enough to -accommodate splines, wavelets, piecewise polynomials, and certain types of -decision trees. For such a tree, $p(w)$ is comprised of indicator functions -over $k$ axis-aligned rectangles forming a partition of $\cW$ (a Haar basis), -provided that the partitions are constructed using independent data -(e.g., with sample splitting). - -Our goal is to approximate the law of the stochastic process -$(\hat\mu(w)-\mu(w):w\in\cW)$, which upon rescaling is typically not -asymptotically tight as $k \to \infty$ and thus does not converge weakly. -Nevertheless, exploiting the intrinsic linearity of the estimator $\hat\mu(w)$, -we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly to construct -a Gaussian -strong approximation. Specifically, we write -% -\begin{equation*} - \hat\mu(w) - \mu(w) - = p(w)^\T H^{-1} S - + p(w)^\T \big(\hat H^{-1} - H^{-1}\big) S - + \Bias(w), -\end{equation*} -% -where $H= \sum_{i=1}^n \E\left[p(W_i) p(W_i)^\T\right]$ -is the expected outer product matrix, $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ -is the score vector, and -$\Bias(w) = p(w)^{\T} \hat H^{-1}\sum_{i=1}^n p(W_i) \mu(W_i) - \mu(w)$. -Imposing some mild time series restrictions and assuming stationarity, -it is not difficult to show -(see Section~\ref{sec:yurinskii_app_proofs}) -that $\|\hat H - H\|_1 \lesssim_\P \sqrt{n k}$ and -$\sup_{w\in\cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$ -for some $\gamma>0$, depending on the specific structure of the basis -functions, the dimension $m$ of the regressors, and the smoothness of the -regression function $\mu$. It remains to study the $k$-dimensional -mean-zero martingale $S$ by applying -Corollary~\ref{cor:yurinskii_sa_martingale} with -$X_i=p(W_i) \varepsilon_i$. Controlling the convergence of the quadratic -variation term $\E[\|\Omega\|_2]$ requires some time series dependence -assumptions; we impose an $\alpha$-mixing condition on $(W_1, \ldots, W_n)$ for -illustration \citep{bradley2005basic}. - -\begin{proposition}[Strong approximation for partitioning series estimators]% - \label{pro:yurinskii_series} - % - Consider the nonparametric regression setup described above - and further assume the following: - % - \begin{enumerate}[label=(\roman*)] - - \item - $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ - is strictly stationary. - - \item - $W_1, \ldots, W_n$ is $\alpha$-mixing with mixing coefficients - satisfying $\sum_{j=1}^\infty \alpha(j) < \infty$. - - \item - $W_i$ has a Lebesgue density on $\cW$ - which is bounded above and away from zero. 
- - \item - $\E\big[|\varepsilon_i|^3 \big] < \infty$ - and - $\E\big[\varepsilon_i^2 \mid \cH_{i-1}\big]=\sigma^2(W_i)$ - is bounded away from zero. - - \item - $p(w)$ is a basis with $k$ features satisfying - Assumptions~2 and~3 in \citet{cattaneo2020large}. - - \end{enumerate} - % - Then, for any sequence $R_n \to \infty$, - there is a zero-mean Gaussian process - $G(w)$ indexed on $\cW$ - with $\Var[G(w)] \asymp\frac{k}{n}$ - satisfying - $\Cov[G(w), G(w')] - = \Cov[p(w)^\T H^{-1} S,\, p(w')^\T H^{-1} S]$ - and - % - \begin{align*} - \sup_{w \in \cW} - \left| \hat\mu(w) - \mu(w) - G(w) \right| - &\lesssim_\P - \sqrt{\frac{k}{n}} - \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} R_n - + \sup_{w \in \cW} |\Bias(w)| - \end{align*} - % - assuming the number of basis functions satisfies $k^3 / n \to 0$. - If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then - % - \begin{align*} - \sup_{w \in \cW} - \left| \hat\mu(w) - \mu(w) - G(w) \right| - &\lesssim_\P - \sqrt{\frac{k}{n}} - \left( \frac{k^3 (\log k)^2}{n} \right)^{1/4} R_n - + \sup_{w \in \cW} |\Bias(w)|. - \end{align*} - % -\end{proposition} - -The core concept in the proof of Proposition~\ref{pro:yurinskii_series} is to -apply -Corollary~\ref{cor:yurinskii_sa_martingale} with -$S = \sum_{i=1}^n p(W_i) \varepsilon_i$ -and $p=\infty$ to construct $T \sim \cN\big(0, \Var[S]\big)$ such that -$\|S - T \|_\infty$ is small, and then setting $G(w) = p(w)^\T H^{-1} T$. So -long as the bias can be appropriately controlled, this result allows for -uniform inference procedures such as uniform confidence bands or shape -specification testing. The condition $k^3 / n \to 0$ is the same (up to logs) -as that imposed by \citet{cattaneo2020large} for i.i.d. data, which gives the -best known strong approximation rate for this problem. Thus, -Proposition~\ref{pro:yurinskii_series} gives the same best approximation rate -without -requiring any extra restrictions for $\alpha$-mixing time series data. - -Our results improve substantially on \citet[Theorem~1]{li2020uniform}: using -the notation of our Corollary~\ref{cor:yurinskii_sa_martingale}, and with any -sequence -$R_n \to \infty$, a valid (see -Remark~\ref{rem:yurinskii_coupling_bounds_probability}) -version of their martingale Yurinskii coupling is -% -\begin{align*} - \|S-T\|_2 - \lesssim_\P - d^{1/2} r^{1/2}_n - + (B_n d)^{1/3} R_n, -\end{align*} -% -where $B_n = \sum_{i=1}^n \E[\|X_i\|_2^3]$ and $r_n$ is a term controlling the -convergence of the quadratic variation, playing a similar role to our -term $\E[\|\Omega\|_2]$. Under the assumptions of our -Proposition~\ref{pro:yurinskii_series}, applying this -result with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ yields a rate no better -than $\|S-T\|_2 \lesssim_\P (n k)^{1/3} R_n$. As such, they attain a rate of -strong approximation no faster than -% -\begin{align*} - \sup_{w \in \cW} - \left| \hat\mu(w) - \mu(w) - G(w) \right| - &\lesssim_\P - \sqrt{\frac{k}{n}} - \left( \frac{k^5}{n} \right)^{1/6} R_n - + \sup_{w \in \cW} |\Bias(w)|. -\end{align*} -% -Hence, for this approach to yield a valid strong approximation, the number of -basis functions must satisfy $k^5/n \to 0$, a more restrictive assumption than -our $k^3 / n \to 0$ (up to logs). This difference is due to -\citet{li2020uniform} using the $\ell^2$-norm version of Yurinskii's coupling -rather than the recently established $\ell^\infty$ version. 
Further, -our approach allows for an improved rate of distributional approximation -whenever the residuals have zero conditional third moment. - -To illustrate the statistical applicability of -Proposition~\ref{pro:yurinskii_series}, consider constructing a feasible uniform -confidence band for the regression function $\mu$, using standardization and -Studentization for statistical power improvements. We assume throughout that -the bias is negligible. Proposition~\ref{pro:yurinskii_series} and -anti-concentration for -Gaussian suprema \citep[Corollary~2.1]{chernozhukov2014anti} yield -a distributional approximation for the supremum statistic whenever -$k^3(\log n)^6 / n \to 0$, giving -% -\begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} - \right| \leq t - \right) - - - \P\left( - \sup_{w \in \cW} - \left| - \frac{G(w)}{\sqrt{\rho(w,w)}} - \right| \leq t - \right) - \right| - &\to 0, -\end{align*} -% -where $\rho(w,w') = \E[G(w)G(w')]$. Further, by a Gaussian--Gaussian -comparison result \citep[Lemma~3.1]{chernozhukov2013gaussian} and -anti-concentration, we show (see the proof of -Proposition~\ref{pro:yurinskii_series}) that with $\bW = (W_1, \ldots, W_n)$ and -$\bY = (Y_1, \ldots, Y_n)$, -% -\begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat\mu(w)-\mu(w)}{\sqrt{\hat\rho(w,w)}} - \right| \leq t - \right) - - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} - \right| \leq t \biggm| \bW, \bY - \right) - \right| - &\to_\P 0, -\end{align*} -% -where $\hat G(w)$ is a zero-mean Gaussian process -conditional on $\bW$ and $\bY$ with conditional covariance function -$\hat\rho(w,w') -=\E\big[\hat G(w) \hat G(w') \mid \bW, \bY \big] -= p(w)^\T \hat H^{-1} \hat V \hat H^{-1}p(w')$ -for some estimator $\hat V$ satisfying -$\frac{k (\log n)^2}{n} -\big\|\hat V-\Var[S]\big\|_2 \to_\P 0$. -For example, one could use the plug-in estimator -$\hat V=\sum_{i=1}^n p(W_i) p(W_i)^\T \hat{\sigma}^2(W_i)$ -where $\hat{\sigma}^2(w)$ satisfies -$(\log n)^2 \sup_{w \in \cW} -|\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$. -This leads to the following feasible and asymptotically valid -$100(1-\tau)\%$ -uniform confidence band for partitioning-based series estimators -based on martingale data. - -\begin{proposition}[Feasible uniform confidence bands for partitioning - series estimators]% - \label{pro:yurinskii_series_feasible} - % - Assume the setup of the preceding section. Then - % - \begin{align*} - \P\Big( - \mu(w) \in - \Big[ - \hat\mu(w) \pm \hat q(\tau) - \sqrt{\hat\rho(w,w)} - \Big] - \ \text{for all } - w \in \cW \Big) - \to 1-\tau, - \end{align*} - % - where - % - \begin{align*} - \hat{q}(\tau) - &= - \inf - \left\{ - t \in \R: - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} - \right| - \leq t - \Bigm| \bW, \bY - \right) - \geq \tau - \right\} - \end{align*} - % - is the conditional quantile of the supremum of the Studentized Gaussian - process. This can be estimated by resampling the conditional law of - $\hat G(w) \mid \bW, \bY$ with a discretization of $w \in \cW$. -\end{proposition} - -\subsection{Local polynomial estimators} -\label{sec:yurinskii_local_poly} - -As a second example application we consider nonparametric regression estimation -with martingale data employing local polynomial methods -\citep{fan1996local}. 
In contrast with the partitioning-based series -methods of Section~\ref{sec:yurinskii_series}, local polynomials induce -stochastic -processes which are not linearly separable, allowing us to showcase the -empirical process result given in Proposition \ref{pro:yurinskii_emp_proc}. - -As before, suppose that -$Y_i = \mu(W_i) + \varepsilon_i$ -for $ 1 \leq i \leq n$ -where $W_i$ has compact connected support $\cW \subseteq \R^m$, -$\cH_i$ is the $\sigma$-algebra generated by -$(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, -$\E[\varepsilon_i \mid \cH_{i-1}] = 0$, -and $\mu: \cW \to \R$ is the estimand. Let $K$ be a kernel function on $\R^m$ -and $K_h(w) = h^{-m} K(w/h)$ for some bandwidth $h > 0$. -Take $\gamma \geq 0$ a fixed polynomial order and let -$k = (m+\gamma)!/(m!\gamma!)$ be the number of monomials up to order $\gamma$. -Using multi-index notation, -let $p(w)$ be the $k$-dimensional vector -collecting the monomials $w^{\kappa}/\kappa!$ -for $0 \leq |\kappa| \leq \gamma$, -and set $p_h(w) = p(w/h)$. -The local polynomial regression estimator of $\mu(w)$ is, -with $e_1 = (1, 0, \ldots, 0)^\T \in \R^k$ the first standard unit vector, -% -\begin{align*} - \hat{\mu}(w) - &= - e_1^\T\hat{\beta}(w) - &\text{where} & - &\hat{\beta}(w) - &= - \argmin_{\beta \in \R^{k}} - \sum_{i=1}^n - \left(Y_i - p_h(W_i-w)^\T \beta \right)^2 - K_h(W_i-w). -\end{align*} - -Our goal is again to approximate the distribution of the entire stochastic -process, $(\hat{\mu}(w)-\mu(w):w\in\cW)$, which upon rescaling is non-Donsker -if $h \to 0$, and decomposes as follows: -% -\begin{align*} - \hat{\mu}(w)-\mu(w) - &= e_1^\T H(w)^{-1} S(w) - + e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) - + \Bias(w) -\end{align*} -% -where -$\hat H(w) = \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T$, -$H(w) = \E \big[ \hat H(w) \big]$, -$S(w)= \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i$, -and -$\Bias(w) = e_1^\T \hat H(w)^{-1} -\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w)$. -A key distinctive feature of local polynomial regression is that both -$\hat H(w)$ and $S(w)$ are functions of the evaluation point $w\in\cW$; -contrast this with the partitioning-based series estimator discussed in -Section~\ref{sec:yurinskii_series}, for which neither $\hat H$ nor $S$ depend -on $w$. -Therefore we use Proposition \ref{pro:yurinskii_emp_proc} to obtain a Gaussian -strong -approximation for the martingale empirical process directly. - -Under mild regularity conditions, including stationarity for simplicity -and an $\alpha$-mixing assumption on the time-dependence of the data, we show -$\sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 -\lesssim_\P \sqrt{n h^{-2m}\log n}$. -Further, -$\sup_{w\in\cW} |\Bias(w)| -\lesssim_\P h^\gamma$ -provided that the regression function is sufficiently smooth. -It remains to analyze the martingale empirical process given by -$\big(e_1^\T H(w)^{-1} S(w) : w\in\cW\big)$ -via Proposition \ref{pro:yurinskii_emp_proc} by setting -% -\begin{align*} - \cF = \left\{ - (W_i, \varepsilon_i) \mapsto - e_1^\T H(w)^{-1} - K_h(W_i-w) p_h(W_i-w) \varepsilon_i - : w \in \cW - \right\}. -\end{align*} -% -With this approach, we obtain the following result. - -\begin{proposition}[Strong approximation for local polynomial estimators]% - \label{pro:yurinskii_local_poly} - - Under the nonparametric regression setup described above, - assume further that - % - \begin{enumerate}[label=(\roman*)] - - \item - $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ - is strictly stationary. 
- - \item - $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ - is $\alpha$-mixing with mixing coefficients - $\alpha(j) \leq e^{-2 j / C_\alpha}$ - for some $C_\alpha > 0$. - - \item - $W_i$ has a Lebesgue density on $\cW$ - which is bounded above and away from zero. - - \item - $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ - for $C_\varepsilon > 0$ and - $\E\left[\varepsilon^2_i \mid \cH_{i-1}\right]=\sigma^2(W_i)$ - is bounded away from zero. - - \item - $K$ is a non-negative Lipschitz - compactly supported kernel with - $\int K(w) \diff{w} = 1$. - - \end{enumerate} - % - Then for any $R_n \to \infty$, - there is a zero-mean Gaussian process - $T(w)$ on $\cW$ - with $\Var[T(w)] \asymp\frac{1}{n h^m}$ - satisfying - $\Cov[T(w), T(w')] - = \Cov[e_1^\T H(w)^{-1} S(w),\, e_1^\T H(w')^{-1} S(w')]$ - and - % - \begin{align*} - \sup_{w \in \cW} - \left|\hat \mu(w) - \mu(w) - T(w) \right| - &\lesssim_\P - \frac{R_n}{\sqrt{n h^m}} - \left( - \frac{(\log n)^{m+4}}{n h^{3m}} - \right)^{\frac{1}{2m+6}} - + \sup_{w \in \cW} |\Bias(w)|, - \end{align*} - % - provided that the bandwidth sequence satisfies - $n h^{3m} \to \infty$. - % -\end{proposition} - -If the residuals further satisfy -$\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$, then -a third-order Yurinskii coupling delivers an improved rate of strong -approximation for Proposition~\ref{pro:yurinskii_local_poly}; this is omitted -here for -brevity. For completeness, the proof of -Proposition~\ref{pro:yurinskii_local_poly} -verifies that if the regression function $\mu(w)$ is $\gamma$ times -continuously differentiable on $\cW$ then -$\sup_w |\Bias(w)| \lesssim_\P h^\gamma$. Further, the assumption that $p(w)$ -is a vector of monomials is unnecessary in general; any collection of bounded -linearly independent functions which exhibit appropriate approximation power -will suffice \citep{eggermont2009maximum}. As such, we can encompass local -splines and wavelets, as well as polynomials, and also choose whether or not to -include interactions between the regressor variables. The bandwidth restriction -of $n h^{3m} \to \infty$ is analogous to that imposed in -Proposition~\ref{pro:yurinskii_series} for partitioning-based series -estimators, and as -far as we know, has not been improved upon for non-i.i.d.\ data. - -Applying an anti-concentration result for Gaussian process suprema, such as -Corollary~2.1 in \citet{chernozhukov2014anti}, allows one to write a -Kolmogorov--Smirnov bound comparing the law of -$\sup_{w \in \cW}|\hat\mu(w) - \mu(w)|$ to that of $\sup_{w \in \cW}|T(w)|$. -With an appropriate covariance estimator, we can further replace $T(w)$ by a -feasible version $\hat T(w)$ or its Studentized counterpart, enabling -procedures for uniform inference analogous to the confidence bands constructed -in Section~\ref{sec:yurinskii_series}. We omit the details of this to conserve -space but -note that our assumptions on $W_i$ and $\varepsilon_i$ ensure that -Studentization is possible even when the discretized covariance matrix has -small eigenvalues (Section~\ref{sec:yurinskii_kde}), as we normalize only by -the diagonal -entries. \citet[Remark~3.1]{chernozhukov2014gaussian} achieve better rates for -approximating the supremum of the $t$-process based on i.i.d.\ data in -Kolmogorov--Smirnov distance by bypassing the step where we first approximate -the entire stochastic process (see Section~\ref{sec:yurinskii_emp_proc} for a -discussion). 
-Nonetheless, our approach targeting the entire process allows for a -potential future -treatment of other functionals as well as the supremum. - -We finally remark that in this setting of kernel-based local empirical -processes, it is essential that our initial strong approximation result -(Corollary~\ref{cor:yurinskii_sa_martingale}) does not impose a lower bound on -the -eigenvalues of the variance matrix $\Sigma$. This effect was demonstrated by -Lemma \ref{lem:yurinskii_kde_eigenvalue}, -Figure~\ref{fig:yurinskii_min_eig}, and their surrounding discussion in -Section~\ref{sec:yurinskii_kde}. As such, the result of \citet{li2020uniform} is -unsuited for this application, even in its simplest formulation, -due to the strong minimum eigenvalue assumption. - -\section{Conclusion} -\label{sec:yurinskii_conclusion} - -In this chapter we introduced as our main result a new version of Yurinskii's -coupling which strictly generalizes all previously known forms of the result. -Our formulation gave a Gaussian mixture coupling for approximate martingale -vectors in $\ell^p$-norm where $1 \leq p \leq \infty$, with no restrictions on -the minimum eigenvalues of the associated covariance matrices. We further -showed how to obtain an improved approximation whenever third moments of the -data are negligible. We demonstrated the applicability of this main result by -first deriving a user-friendly version, and then specializing it to mixingales, -martingales, and independent data, illustrating the benefits with a collection -of simple factor models. We then considered the problem of constructing uniform -strong approximations for martingale empirical processes, demonstrating how our -new Yurinskii coupling can be employed in a stochastic process setting. As -substantive illustrative applications of our theory to some -well-established problems in statistical methodology, we showed how to use our -coupling results for both vector-valued and empirical process-valued -martingales in developing uniform inference procedures for partitioning-based -series estimators and local polynomial models in nonparametric regression. At -each stage we addressed issues of feasibility, compared our work with the -existing literature, and provided implementable statistical inference -procedures. The work in this chapter is based on \citet{cattaneo2022yurinskii}. - -\appendix - - -\chapter{Supplement to Inference with Mondrian Random Forests} -\label{app:mondrian} - -In this section we present the full proofs of all our results, -and also state some useful technical preliminary and -intermediate lemmas, along with some further properties -of the Mondrian process not required for our primary analysis. -See Section~\ref{sec:mondrian_overview_proofs} in the main text -for an overview of the main proof strategies and a discussion of -the challenges involved. -We use the following simplified notation for convenience, -whenever it is appropriate. -We write $\I_{i b}(x) = \I \left\{ X_i \in T_b(x) \right\}$ -and $N_b(x) = \sum_{i=1}^{n} \I_{i b}(x)$, -as well as $\I_b(x) = \I \left\{ N_b(x) \geq 1 \right\}$. - -\section{Preliminary lemmas} - -We begin by bounding the maximum size of any cell -in a Mondrian forest containing $x$. -This result is used regularly throughout many of our other proofs, -and captures the ``localizing'' behavior of the Mondrian random -forest estimator, showing that Mondrian cells have side lengths -at most on the order of $1/\lambda$. 
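
Throughout these proofs we also make repeated use of the exact distribution
of the cell $T(x)$ containing a fixed point $x \in (0,1)^d$, derived by
\citet[Proposition~1]{mourtada2020minimax}: each side length satisfies
%
\begin{align*}
    |T(x)_j|
    &=
    \left( \frac{E_{j1}}{\lambda} \wedge x_j \right)
    + \left( \frac{E_{j2}}{\lambda} \wedge (1 - x_j) \right),
\end{align*}
%
where $E_{j1}$ and $E_{j2}$ are mutually independent $\Exp(1)$ random
variables, independently across $1 \leq j \leq d$.
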
- -\begin{lemma}[Upper bound on the largest cell in a Mondrian forest]% - \label{lem:mondrian_app_largest_cell} - % - Let $T_1, \ldots, T_b \sim \cM\big([0,1]^d, \lambda\big)$ - and take $x \in (0,1)^d$. Then for all $t > 0$ - % - \begin{align*} - \P \left( - \max_{1 \leq b \leq B} - \max_{1 \leq j \leq d} - |T_b(x)_j| - \geq \frac{t}{\lambda} - \right) - &\leq - 2dB e^{-t/2}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell}] - % - We use the distribution of the Mondrian cell shape - \citep[Proposition~1]{mourtada2020minimax}. We have - $|T_b(x)_j| = \left( \frac{E_{bj1}}{\lambda} \wedge x_j \right) - + \left( \frac{E_{bj2}}{\lambda} \wedge (1-x_j) \right)$ - where $E_{bj1}$ and $E_{bj2}$ - are i.i.d.\ $\Exp(1)$ variables for - $1 \leq b \leq B$ and $1 \leq j \leq d$. - Thus $|T_b(x)_j| \leq \frac{E_{bj1} + E_{bj2}}{\lambda}$ - so by a union bound - % - \begin{align*} - \P \left( - \max_{1 \leq b \leq B} - \max_{1 \leq j \leq d} - |T_b(x)_j| - \geq \frac{t}{\lambda} - \right) - &\leq - \P \left( - \max_{1 \leq b \leq B} - \max_{1 \leq j \leq d} - (E_{bj1} \vee E_{bj2}) - \geq \frac{t}{2} - \right) \\ - &\leq - 2dB\, - \P \left( - E_{bj1} - \geq \frac{t}{2} - \right) - \leq - 2dB e^{-t/2}. - \end{align*} - % -\end{proof} - -Next is another localization result, -showing that the union -of the cells $T_b(x)$ containing $x$ does not contain ``too many'' -samples $X_i$. -Thus the Mondrian random forest estimator fitted at $x$ -only depends on $n/\lambda^d$ (the effective sample size) -data points up to logarithmic terms. - -\begin{lemma}[Upper bound on the number of active data points]% - \label{lem:mondrian_app_active_data} - Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} - hold, - and define - $N_{\cup}(x) = - \sum_{i=1}^{n} \I \left\{ X_i \in \bigcup_{b=1}^{B} T_b(x) \right\}$. - Then for $t > 0$ and sufficiently large $n$, - with $\|f\|_\infty = \sup_{x \in [0,1]^d} f(x)$, - % - \begin{align*} - \P \left( N_{\cup}(x) > t^{d+1} - \frac{n}{\lambda^d} - \|f\|_\infty - \right) - &\leq - 4 d B e^{-t/4}. - \end{align*} -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_active_data}] - - Note - $N_\cup(x) \sim - \Bin\left(n, \int_{\bigcup_{b=1}^{B} T_b(x)} f(s) \diff s \right) - \leq \Bin\left(n, 2^d \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} - |T_b(x)_j|^d \|f\|_\infty \right)$ - conditionally on $\bT$. - If $N \sim \Bin(n,p)$ then, by Bernstein's inequality, - $\P\left( N \geq (1 + t) n p\right) - \leq \exp\left(-\frac{t^2 n^2 p^2 / 2}{n p(1-p) + t n p / 3}\right) - \leq \exp\left(-\frac{3t^2 n p}{6 + 2t}\right)$. - Thus for $t \geq 2$, - % - \begin{align*} - \P \left( N_{\cup}(x) > (1+t) n \frac{2^d t^d}{\lambda^d} - \|f\|_\infty - \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} - |T_j(x)| \leq \frac{t}{\lambda} - \right) - &\leq - \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right). - \end{align*} - % - By Lemma~\ref{lem:mondrian_app_largest_cell}, - $\P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} - |T_j(x)| > \frac{t}{\lambda} \right) - \leq 2 d B e^{-t/2}$. 
- Hence - % - \begin{align*} - &\P \left( N_{\cup}(x) > 2^{d+1} t^{d+1} \frac{n}{\lambda^d} - \|f\|_\infty - \right) \\ - &\quad\leq - \P \left( N_{\cup}(x) > 2 t n \frac{2^d t^d}{\lambda^d} - \|f\|_\infty - \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} - |T_j(x)| \leq \frac{t}{\lambda} - \right) - + \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} - |T_j(x)| > \frac{t}{\lambda} - \right) \\ - &\quad\leq - \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right) - + 2 d B e^{-t/2}. - \end{align*} - % - Replacing $t$ by $t/2$ gives that for sufficiently large $n$ such that - $n / \lambda^d \geq 1$, - % - \begin{align*} - \P \left( N_{\cup}(x) > t^{d+1} - \frac{n}{\lambda^d} - \|f\|_\infty - \right) - &\leq - 4 d B e^{-t/4}. - \end{align*} - % -\end{proof} - -Next we give a series of results culminating in a -generalized moment bound for the denominator appearing -in the Mondrian random forest estimator. -We begin by providing a moment bound for the truncated inverse binomial -distribution, which will be useful for controlling -$\frac{\I_b(x)}{N_b(x)} \leq 1 \wedge \frac{1}{N_b(x)}$ -because conditional on $T_b$ we have -$N_b(x) \sim \Bin \left( n, \int_{T_b(x)} f(s) \diff s \right)$. -Our constants could be significantly suboptimal but they are sufficient -for our applications. - -\begin{lemma}[An inverse moment bound for the binomial distribution]% - \label{lem:mondrian_app_binomial_bound} - For $n \geq 1$ and $p \in [0,1]$, - let $N \sim \Bin(n, p)$ and $a_1, \ldots, a_k \geq 0$. - Then - % - \begin{align*} - \E\left[ - \prod_{j=1}^k - \left( - 1 \wedge - \frac{1}{N + a_j} - \right) - \right] - &\leq - (9k)^k - \prod_{j=1}^k - \left( - 1 \wedge - \frac{1}{n p + a_j} - \right). - \end{align*} -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_bound}] - By Bernstein's inequality, - $\P\left( N \leq n p - t \right) - \leq \exp\left(-\frac{t^2/2}{n p(1-p) + t/3}\right) - \leq \exp\left(-\frac{3t^2}{6n p + 2t}\right)$. - Therefore we have - $\P\left( N \leq n p/4 \right) - \leq \exp\left(-\frac{27 n^2 p^2 / 16}{6n p + 3 n p / 2}\right) - = e^{-9 n p / 40}$. - Partitioning by this event gives - % - \begin{align*} - \E\left[ - \prod_{j=1}^k - \left( - 1 \wedge - \frac{1}{N + a_j} - \right) - \right] - &\leq - e^{-9 n p / 40} - \prod_{j=1}^k - \frac{1}{1 \vee a_j} - + \prod_{j=1}^k - \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ - &\leq - \prod_{j=1}^k - \frac{1}{\frac{9 n p}{40k} + (1 \vee a_j)} - + \prod_{j=1}^k - \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ - &\leq - \prod_{j=1}^k - \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} - + \prod_{j=1}^k - \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ - &\leq - 2 \prod_{j=1}^k - \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} - \leq - 2 \prod_{j=1}^k - \frac{40k/9}{1 \vee \left(n p + a_j\right)} \\ - &\leq - (9k)^k - \prod_{j=1}^k - \left( - 1 \wedge - \frac{1}{n p + a_j} - \right). - \end{align*} -\end{proof} - -Our next result is probably the most technically involved, -allowing one to bound moments of -(products of) $\frac{\I_b(x)}{N_b(x)}$ by the corresponding moments of -(products of) $\frac{1}{n |T_b(x)|}$, again based on the heuristic -that $N_b(x)$ is conditionally binomial so concentrates around -its conditional expectation -$n \int_{T_b(x)} f(x) \diff s \asymp n |T_b(x)|$. -By independence of the trees, -the latter expected products then factorize -since the dependence on the data $X_i$ has been eliminated. -The proof is complicated, and relies on the following induction procedure. 
-First we consider the common refinement consisting of the -subcells $\cR$ generated by all possible intersections -of $T_b(x)$ over the selected trees -(say $T_{b}(x), T_{b'}(x), T_{b''}(x)$ -though there could be arbitrarily many). -Note that $N_b(x)$ is the sum of the number of -samples $X_i$ in each such subcell in $\cR$. -We then apply Lemma~\ref{lem:mondrian_app_binomial_bound} repeatedly -to each subcell in $\cR$ in turn, replacing -the number of samples $X_i$ in that subcell with its volume -multiplied by $n$, and controlling the error incurred at each step. -We record the subcells which have been ``checked'' in this manner -using the class $\cD \subseteq \cR$ and proceed by finite induction, -beginning with $\cD = \emptyset$ and ending at $\cD = \cR$. - -\begin{lemma}[Generalized moment bound for - Mondrian random forest denominators]% - \label{lem:mondrian_app_moment_denominator} - - Suppose Assumptions~\ref{ass:mondrian_data} - and \ref{ass:mondrian_estimator} hold. - Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ - be independent and $k_b \geq 1$ for $1 \leq b \leq B_0$. - Then with $k = \sum_{b=1}^{B_0} k_b$, - for sufficiently large $n$, - % - \begin{align*} - \E\left[ - \prod_{b=1}^{B_0} - \frac{\I_b(x)}{N_b(x)^{k_b}} - \right] - &\leq - \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} - \prod_{b=1}^{B_0} - \E \left[ - 1 \wedge - \frac{1}{(n |T_b(x)|)^{k_b}} - \right]. - \end{align*} -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_denominator}] - - Define the common refinement of - $\left\{ T_b(x) : 1 \leq b \leq {B_0} \right\}$ as - the class of sets - % - \begin{align*} - \cR - &= \left\{ \bigcap_{b=1}^{B_0} D_b : - D_b \in - \big\{ T_b(x), T_b(x)^{\comp} \big\} - \right\} - \bigsetminus - \left\{ - \emptyset,\, - \bigcap_{b=1}^{B_0} - T_b(x)^\comp - \right\} - \end{align*} - % - and let $\cD \subset \cR$. - We will proceed by induction on the elements of $\cD$, - which represents the subcells we have checked, - starting from $\cD = \emptyset$ and finishing at $\cD = \cR$. - For $D \in \cR$ let - $\cA(D) = \left\{ 1 \leq b \leq {B_0} : D \subseteq T_b(x) \right\}$ - be the indices of the trees which are active on subcell $D$, - and for $1 \leq b \leq {B_0}$ let - $\cA(b) = \left\{ D \in \cR : D \subseteq T_b(x) \right\}$ - be the subcells which are contained in $T_b(x)$, - so that $b \in \cA(D) \iff D \in \cA(b)$. - For a subcell $D \in \cR$, - write $N_b(D) = \sum_{i=1}^{n} \I \left\{ X_i \in D \right\}$ - so that $N_b(x) = \sum_{D \in \cA(b)} N_b(D)$. - Note that for any $D \in \cR \setminus \cD$, - % - \begin{align*} - &\E \left[ - \prod_{b=1}^{B_0} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus \cD} - N_b(D') - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b} - } - \right] \\ - &\quad= - \E \left[ - \prod_{b \notin \cA(D)} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus \cD} - N_b(D') - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b} - } \right. \\ - &\left. - \qquad - \times\,\E\left[ - \prod_{b \in \cA(D)} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus \cD} - N_b(D') - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b} - } \right.\right. \\ - &\left.\left. - \quad\qquad\qquad\biggm| - \bT, - N_b(D') : D' \in \cR - \setminus - (\cD \cup \{D\}) - \right] - \right]. - \end{align*} - % - Now the inner conditional expectation is over $N_b(D)$ only. 
- Since $f$ is bounded away from zero, - % - \begin{align*} - N_b(D) - &\sim \Bin\left( - n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ - \frac{\int_{D} f(s) \diff s} - {1 - \int_{\bigcup \left( \cR \setminus \cD \right) \setminus D} - f(s) \diff s} - \right) \\ - &\geq \Bin\left( - n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ - |D| \inf_{x \in [0,1]^d} f(x) - \right) - \end{align*} - % - conditional on $\bT$ and - $N_b(D') : D' \in \cR \setminus (\cD \cup \{D\})$. - For sufficiently large $t$ by Lemma~\ref{lem:mondrian_app_active_data} - % - \begin{align*} - \P \left( - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D') - > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) - &\leq - \P \left( N_{\cup}(x) > t^{d+1} - \frac{n}{\lambda^d} - \|f\|_\infty - \right) - \leq - 4 d B_0 e^{-t/4}. - \end{align*} - % - Thus - $N_b(D) \geq \Bin(n/2, |D| \inf_x f(x))$ - conditional on - $\left\{ \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right\}$ - with probability at least - $1 - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}$. - So by Lemma~\ref{lem:mondrian_app_binomial_bound}, - % - \begin{align*} - &\E \Bigg[ - \prod_{b \in \cA(D)} \! - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus \cD} - N_b(D') - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b} - } - \biggm| - \! - \bT, - N_b(D')\! : D' \in \cR \setminus \! (\cD \cup \{D\}) - \Bigg] \\ - &\quad\leq - \E \! \left[ - \prod_{b \in \cA(D)} - \frac{(9k)^{k_b}}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} - N_b(D') - + n |D| \inf_x f(x) / 2 - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b}} - \right] \\ - &\qquad+ - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \\ - &\quad\leq - \left( \frac{18k}{\inf_x f(x)} \right)^k - \! \E \! \left[ - \prod_{b \in \cA(D)} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} - N_b(D') - + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} - |D'| - \right)^{k_b}} - \right] \\ - &\qquad+ - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. - \end{align*} - % - Therefore plugging this back into the marginal expectation yields - % - \begin{align*} - &\E\left[ - \prod_{b=1}^{B_0} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus \cD} - N_b(D') - + n \sum_{D' \in \cA(b) \cap \cD} - |D'| - \right)^{k_b} - } - \right] \\ - &\quad\leq - \left( \frac{18k}{\inf_x f(x)} \right)^k - \E \left[ - \prod_{b=1}^{B_0} - \frac{1}{ - 1 \vee \left( - \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} - N_b(D') - + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} - |D'| - \right)^{k_b}} - \right] \\ - &\qquad+ - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. - \end{align*} - % - Now we apply induction, - starting with $\cD = \emptyset$ and - adding $D \in \cR \setminus \cD$ to $\cD$ until - $\cD = \cR$. - This takes at most $|\cR| \leq 2^{B_0}$ steps and yields - % - \begin{align*} - \E\left[ - \prod_{b=1}^{B_0} - \frac{\I_b(x)}{N_b(x)^{k_b}} - \right] - &\leq - \E\left[ - \prod_{b=1}^{B_0} - \frac{1}{1 \vee N_b(x)^{k_b}} - \right] - = - \E\left[ - \prod_{b=1}^{B_0} - \frac{1}{1 \vee \left( \sum_{D \in \cA(b)} N_b(D) \right)^{k_b}} - \right] - \leq \cdots \\ - &\leq - \left( \frac{18k}{\inf_x f(x)} \right)^{2^{B_0} k} - \left( - \prod_{b=1}^{B_0} - \,\E \left[ - \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} - \right] - + 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} - \right), - \end{align*} - % - where the expectation factorizes due to independence of $T_b(x)$. 
- The last step is to remove the trailing exponential term. - To do this, note that by Jensen's inequality, - % - \begin{align*} - \prod_{b=1}^{B_0} - \,\E \left[ - \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} - \right] - &\geq - \prod_{b=1}^{B_0} - \frac{1} - {\E \left[ 1 \vee (n |T_b(x)|)^{k_b} \right]} - \geq - \prod_{b=1}^{B_0} - \frac{1}{n^{k_b}} - = n^{-k} - \geq - 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} - \end{align*} - % - for sufficiently large $n$ - because $B_0$, $d$, and $k$ are fixed while - $\log \lambda \gtrsim \log n$. -\end{proof} - -Now that moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ -have been bounded by moments of -(products of) $\frac{1}{n |T_b(x)|}$, we establish further -explicit bounds for these in the next result. -Note that the problem has been reduced to determining -properties of Mondrian cells, so once again we return to the -exact cell shape distribution given by \citet{mourtada2020minimax}, -and evaluate the appropriate expectations by integration. -Note that the truncation by taking the minimum with one inside the expectation -is essential here, as otherwise second moment of the inverse Mondrian cell -volume is not even finite. As such, there is a ``penalty'' of $\log n$ -when bounding truncated second moments, -and the upper bound for the $k$th moment is significantly -larger than the naive assumption of $(\lambda^d / n)^k$ -whenever $k \geq 3$. -This ``small cell'' phenomenon in which the inverse volumes of Mondrian cells -have heavy tails is a recurring challenge. - -\begin{lemma}[Inverse moments of the volume of a Mondrian cell]% - \label{lem:mondrian_app_moment_cell} - - Suppose Assumption~\ref{ass:mondrian_estimator} holds - and let $T \sim \cM\big([0,1]^d, \lambda\big)$. - Then for sufficiently large $n$, - % - \begin{align*} - \E\left[ - 1 \wedge - \frac{1}{(n |T(x)|)^k} - \right] - &\leq - \left( - \frac{\lambda^d}{n} - \right)^{\I \left\{ k = 1 \right\}} - \left( - \frac{3 \lambda^{2d} \log n}{n^2} - \right)^{\I \left\{ k \geq 2 \right\}} - \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)}. - \end{align*} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_cell}] - - By \citet[Proposition~1]{mourtada2020minimax}, - $|T(x)| = \prod_{j=1}^{d} - \left( - \left(\frac{1}{\lambda} E_{j1} \right) \wedge x_j - + \left( \frac{1}{\lambda} E_{j2} \right) \wedge (1-x_j) - \right)$ - where $E_{j1}$ and $E_{j2}$ - are mutually independent $\Exp(1)$ random variables. - Thus for $02$ we use - $\frac{1}{1 \vee (n |T(x)|)^k} \leq \frac{1}{1 \vee (n |T(x)|)^{k-1}}$ - to reduce $k$. Now if $k = 1$ we let $t \to 0$, giving - % - \begin{align*} - \E \left[ - \frac{1}{1 \vee (n |T(x)|)} - \right] - &\leq - \frac{\lambda^d}{n} - \prod_{j=1}^d - \frac{1}{x_j(1-x_j)}, - \end{align*} - % - and if $k = 2$ then we set $t = 1/n^2$ so that for - sufficiently large $n$, - % - \begin{align*} - \E \left[ - \frac{1}{1 \vee (n |T(x)|)^2} - \right] - &\leq - \frac{d}{n^2} - + \frac{2 \lambda^{2d} \log n}{n^2} - \prod_{j=1}^d - \frac{1}{x_j(1-x_j)} - \leq - \frac{3 \lambda^{2d} \log n}{n^2} - \prod_{j=1}^d - \frac{1}{x_j(1-x_j)}. 
- \end{align*} - % - Lower bounds which match up to constants for the first moment and up to - logarithmic terms for the second moment are obtained as - $\E \left[ 1 \wedge \frac{1}{(n|T(x)|)^2} \right] - \geq \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]^2$ - by Jensen, and - % - \begin{align*} - \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right] - &\geq \frac{1}{1 + n \E \left[ |T(x)| \right]} - \geq \frac{1}{1 + 2^d n / \lambda^d} - \gtrsim \frac{\lambda^d}{n}. - \end{align*} -\end{proof} - -The endeavor to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ is -concluded with the next result, combining the previous two lemmas to give a -bound without expectations on the right. - -\begin{lemma}[Simplified generalized moment bound for - Mondrian forest denominators]% - \label{lem:mondrian_app_simple_moment_denominator} - % - Suppose Assumptions~\ref{ass:mondrian_data} - and \ref{ass:mondrian_estimator} hold. - Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ - and $k_b \geq 1$ for $1 \leq b \leq B_0$. - Then with $k = \sum_{b=1}^{B_0} k_b$, - % - \begin{align*} - &\E\left[ - \prod_{b=1}^{B_0} - \frac{\I_b(x)}{N_b(x)^{k_b}} - \right] \\ - &\quad\leq - \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} - \left( - \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)} - \right)^{B_0} - \prod_{b=1}^{B_0} - \left( - \frac{\lambda^d}{n} - \right)^{\I \left\{ k_b = 1 \right\}} - \left( - \frac{\lambda^{2d} \log n}{n^2} - \right)^{\I \left\{ k_b \geq 2 \right\}} - \end{align*} - % - for sufficiently large $n$. - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_simple_moment_denominator}] - This follows directly from - Lemmas~\ref{lem:mondrian_app_moment_denominator} and - \ref{lem:mondrian_app_moment_cell}. -\end{proof} - -Our final preliminary lemma is concerned with further properties of -the inverse truncated binomial distribution, again with the aim -of analyzing $\frac{\I_b(x)}{N_b(x)}$. -This time, instead of merely upper bounding the moments, -we aim to give convergence results for those moments, -again in terms of moments of $\frac{1}{n |T_b(x)|}$. -This time we only need to handle the first -and second moment, so this result does not strictly generalize -Lemma~\ref{lem:mondrian_app_binomial_bound} except in simple cases. -The proof is by Taylor's theorem and the Cauchy--Schwarz inequality, -using explicit expressions for moments of the binomial distribution -and bounds from Lemma~\ref{lem:mondrian_app_binomial_bound}. - -\begin{lemma}[Expectation inequalities for the binomial distribution]% - \label{lem:mondrian_app_binomial_expectation} - Let $N \sim \Bin(n, p)$ and take $a, b \geq 1$. Then - % - \begin{align*} - 0 - &\leq - \E \left[ - \frac{1}{N+a} - \right] - - \frac{1}{n p+a} - \leq - \frac{2^{19}}{(n p+a)^2}, \\ - 0 - &\leq - \E \left[ - \frac{1}{(N+a)(N+b)} - \right] - - \frac{1}{(n p+a)(n p+b)} - \leq - \frac{2^{27}}{(n p +a)(n p +b)} - \left( - \frac{1}{n p + a} - + \frac{1}{n p + b} - \right). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_expectation}] - - For the first result, - Taylor's theorem with Lagrange remainder - for $N \mapsto \frac{1}{N+a}$ around $n p$ gives - % - \begin{align*} - \E \left[ - \frac{1}{N+a} - \right] - &= - \E \left[ - \frac{1}{n p+a} - - \frac{N - n p}{(n p+a)^2} - + \frac{(N - n p)^2}{(\xi+a)^3} - \right] - \end{align*} - % - for some $\xi$ between $n p$ and $N$. 
  The second term in the expectation
  is zero-mean, showing the non-negativity part, and the
  Cauchy--Schwarz inequality for the remaining term gives
  %
  \begin{align*}
    \E \left[
      \frac{1}{N+a}
      \right]
    - \frac{1}{n p+a}
    &\leq
    \E \left[
      \frac{(N - n p)^2}{(n p+a)^3}
      + \frac{(N - n p)^2}{(N+a)^3}
      \right] \\
    &\leq
    \frac{\E\big[(N - n p)^2\big]}{(n p+a)^3}
    + \sqrt{
      \E\big[(N - n p)^4\big]
      \E \left[
        \frac{1}{(N+a)^6}
        \right]}.
  \end{align*}
  %
  Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$
  and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
  %
  \begin{align*}
    \E \left[
      \frac{1}{N+a}
      \right]
    - \frac{1}{n p+a}
    &\leq
    \frac{n p}{(n p+a)^3}
    + \sqrt{\frac{54^6 n p(1+3 n p)}{(n p + a)^6}}
    \leq
    \frac{2^{19}}{(n p+a)^2}.
  \end{align*}
  %
  For the second result,
  Taylor's theorem applied to $N \mapsto \frac{1}{(N+a)(N+b)}$
  around $n p$ gives
  %
  \begin{align*}
    \E \left[
      \frac{1}{(N+a)(N+b)}
      \right]
    &=
    \E \left[
      \frac{1}{(n p+a)(n p + b)}
      - \frac{(N - n p)(2 n p + a + b)}{(n p + a)^2 (n p + b)^2}
      \right] \\
    &\quad+
    \E \left[
      \frac{(N - n p)^2}{(\xi+a)(\xi+b)}
      \left(
      \frac{1}{(\xi + a)^2}
      + \frac{1}{(\xi + a)(\xi + b)}
      + \frac{1}{(\xi + b)^2}
      \right)
      \right]
  \end{align*}
  %
  for some $\xi$ between $n p$ and $N$. The second term on the right is
  zero-mean, showing non-negativity, and applying the Cauchy--Schwarz
  inequality to the remaining term gives
  %
  \begin{align*}
    &\E \left[
      \frac{1}{(N+a)(N+b)}
      \right]
    - \frac{1}{(n p+a)(n p+b)} \\
    &\quad\leq
    \E \left[
      \frac{2 (N - n p)^2}{(N+a)(N+b)}
      \left(
      \frac{1}{(N + a)^2}
      + \frac{1}{(N + b)^2}
      \right)
      \right] \\
    &\qquad+
    \E \left[
      \frac{2 (N - n p)^2}{(n p +a)(n p +b)}
      \left(
      \frac{1}{(n p + a)^2}
      + \frac{1}{(n p + b)^2}
      \right)
      \right] \\
    &\quad\leq
    \sqrt{
      4 \E \left[ (N - n p)^4 \right]
      \E \left[
        \frac{1}{(N + a)^6 (N+b)^2}
        + \frac{1}{(N + b)^6 (N+a)^2}
        \right]} \\
    &\qquad+
    \frac{2 \E\big[(N - n p)^2\big]}{(n p +a)(n p +b)}
    \left(
    \frac{1}{(n p + a)^2}
    + \frac{1}{(n p + b)^2}
    \right).
  \end{align*}
  %
  Now we use
  $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$
  and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
  %
  \begin{align*}
    \E \left[
      \frac{1}{(N+a)(N+b)}
      \right]
    - \frac{1}{(n p+a)(n p+b)}
    &\leq
    \sqrt{
      \frac{4n p (1 + 3n p) \cdot 72^8}{(n p + a)^2 (n p + b)^2}
      \left(
      \frac{1}{(n p + a)^4}
      + \frac{1}{(n p + b)^4}
      \right)} \\
    &\quad+
    \frac{2 n p}{(n p +a)(n p +b)}
    \left(
    \frac{1}{(n p + a)^2}
    + \frac{1}{(n p + b)^2}
    \right) \\
    &\leq
    \frac{2^{27}}{(n p + a) (n p + b)}
    \left(
    \frac{1}{n p + a}
    + \frac{1}{n p + b}
    \right).
  \end{align*}
  %
\end{proof}

\section{Proofs of main results}
\label{sec:mondrian_app_proofs}

\subsection{Mondrian random forests}

We give rigorous proofs of the central limit theorem,
bias characterization, and variance estimation
results for the Mondrian random forest estimator without debiasing.
See Section~\ref{sec:mondrian_overview_proofs} in the main text
for details on our approaches to these proofs.

\begin{proof}[Theorem~\ref{thm:mondrian_clt}]
  This follows from the debiased version
  (Theorem~\ref{thm:mondrian_clt_debiased}) with $J=0$, $a_0 = 1$, and
  $\omega_0 = 1$.
-\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_bias}] - - \proofparagraph{removing the dependence on the trees} - - By measurability and with $\mu(X_i) = \E[Y_i \mid X_i]$ almost surely, - % - \begin{align*} - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \mu(x) - &= - \frac{1}{B} - \sum_{b=1}^B - \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) - \frac{\I_{i b}(x)}{N_b(x)}. - \end{align*} - % - Conditional on $\bX$, - the terms in the outer sum depend only on $T_b$ so are i.i.d. - As $\mu$ is Lipschitz, - % - \begin{align*} - &\Var \big[ - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \mu(x) - \mid \bX - \big] - \leq - \frac{1}{B} - \E \left[ - \left( - \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) - \frac{\I_{i b}(x)}{N_b(x)} - \right)^2 - \Bigm| \bX - \right] \\ - &\quad\lesssim - \frac{1}{B} - \E \left[ - \max_{1 \leq i \leq n} - \big\| X_i - x \big\|_2^2 - \left( - \sum_{i=1}^n - \frac{\I_{i b}(x)}{N_b(x)} - \right)^2 - \Bigm| \bX - \right] - \lesssim - \frac{1}{B} - \sum_{j=1}^{d} - \E \left[ - |T(x)_j|^2 - \right] - \lesssim - \frac{1}{\lambda^2 B}, - \end{align*} - % - using the law of $T(x)_j$ from \citet[Proposition~1]{mourtada2020minimax}. - By Chebyshev's inequality, - % - \begin{align*} - \big| - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \E \left[ \hat \mu(x) \mid \bX \right] - \big| - &\lesssim_\P - \frac{1}{\lambda \sqrt B}. - \end{align*} - - \proofparagraph{showing the conditional bias converges in probability} - - Now $\E \left[ \hat\mu(x) \mid \bX \right]$ - is a non-linear function of the i.i.d.\ random variables $X_i$, - so we use the Efron--Stein inequality - \citep{efron1981jackknife} to bound its variance. - Let $\tilde X_{i j} = X_i$ if $i \neq j$ and be an - independent copy of $X_j$, denoted $\tilde X_j$, if $i = j$. - Write $\tilde \bX_j = (\tilde X_{1j}, \ldots, \tilde X_{n j})$ - and similarly - $\tilde \I_{i j b}(x) = \I \big\{ \tilde X_{i j} \in T_b(x) \big\}$ - and $N_{j b}(x) = \sum_{i=1}^{n} \tilde \I_{i j b}(x)$. - % - \begin{align} - \nonumber - &\Var \left[ - \sum_{i=1}^{n} - \big( \mu(X_i) - \mu(x) \big) - \E \left[ - \frac{\I_{i b}(x)}{N_b(x)} - \Bigm| \bX - \right] - \right] \\ - \nonumber - &\quad\leq - \frac{1}{2} - \sum_{j=1}^{n} - \E \! \left[ - \! \left( - \sum_{i=1}^{n} - \big( \mu(X_i) - \mu(x) \big) - \E \! \left[ - \frac{\I_{i b}(x)}{N_b(x)} - \Bigm| \bX - \right] - - \sum_{i=1}^{n} - \left( \mu(\tilde X_{i j}) - \mu(x) \right) - \E \! \left[ - \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} - \Bigm| \tilde \bX_j - \right] - \right)^{\! \! 2} - \right] \\ - \nonumber - &\quad\leq - \frac{1}{2} - \sum_{j=1}^{n} - \E \left[ - \left( - \sum_{i=1}^{n} - \left( - \big( \mu(X_i) - \mu(x) \big) - \frac{\I_{i b}(x)}{N_b(x)} - - \left( \mu(\tilde X_{i j}) - \mu(x) \right) - \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} - \right) - \right)^2 - \right] \\ - \nonumber - &\quad\leq - \sum_{j=1}^{n} - \E \left[ - \left( - \sum_{i \neq j} - \big( \mu(X_i) - \mu(x) \big) - \left( - \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} - \right) - \right)^{\!\!2} \, - \right] \\ - \label{eq:mondrian_app_bias_efron_stein} - &\qquad+ - 2 \sum_{j=1}^{n} - \E \left[ - \left( \mu(X_j) - \mu(x) \right)^2 - \frac{\I_{j b}(x)}{N_b(x)^2} - \right]. - \end{align} - % - For the first term in \eqref{eq:mondrian_app_bias_efron_stein} to be non-zero, - we must have $|N_b(x) - \tilde N_{j b}(x)| = 1$. 
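  Indeed, only the $j$th observation is resampled, so the counts
  $N_b(x)$ and $\tilde N_{j b}(x)$ differ by at most one,
  and if they are equal then every summand in the first term vanishes.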
- Writing $N_{-j b}(x) = \sum_{i \neq j} \I_{i b}(x)$, - assume by symmetry that - $\tilde N_{j b}(x) = N_{-j b}(x)$ and $N_b(x) = N_{-j b}(x) + 1$, - and $\I_{j b}(x) = 1$. - As $f$ is bounded and $\mu$ is Lipschitz, - writing $\I_{-j b}(x) = \I \left\{ N_{-j b}(x) \geq 1 \right\}$, - % - \begin{align*} - &\sum_{j=1}^{n} - \E \left[ - \left( - \sum_{i \neq j} - \left( \mu(X_i) - \mu(x) \right) - \left( - \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} - \right) - \right)^{\! 2} \, - \right] \\ - &\quad\lesssim - \sum_{j=1}^{n} - \E \left[ - \max_{1 \leq l \leq d} - |T_b(x)_l|^2 - \left( - \frac{\sum_{i \neq j}\I_{i b}(x) \I_{j b}(x)} - {N_{-j b}(x)(N_{-j b}(x) + 1)} - \right)^2 - \right] - \lesssim - \E \left[ - \max_{1 \leq l \leq d} - |T_b(x)_l|^2 - \frac{\I_{b}(x)}{N_{b}(x)} - \right]. - \end{align*} - % - For $t > 0$, partition by - $\left\{ \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right\}$ - and apply Lemma~\ref{lem:mondrian_app_largest_cell} and - Lemma~\ref{lem:mondrian_app_simple_moment_denominator}: - % - \begin{align*} - \E \left[ - \max_{1 \leq l \leq d} - |T_b(x)_l|^2 - \frac{\I_{b}(x)}{N_{b}(x)} - \right] - &\leq - \P \left( - \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda - \right) - + (t / \lambda)^2\, - \E \left[ - \frac{\I_{b}(x)}{N_{b}(x)} - \right] \\ - &\lesssim - e^{-t/2} - + \left( \frac{t}{\lambda} \right)^2 - \frac{\lambda^d}{n} - \lesssim - \frac{1}{n^2} - + \frac{(\log n)^2}{\lambda^2} - \frac{\lambda^d}{n} - \lesssim - \frac{(\log n)^2}{\lambda^2} - \frac{\lambda^{d}}{n}, - \end{align*} - % - where we set $t = 4 \log n$. - For the second term in \eqref{eq:mondrian_app_bias_efron_stein} we have - % - \begin{align*} - \sum_{j=1}^{n} - \E \left[ - \left( \mu(X_j) - \mu(x) \right)^2 - \frac{\I_{j b}(x)}{N_b(x)^2} - \right] - &\lesssim - \E \left[ - \max_{1 \leq l \leq d} - |T_b(x)_l|^{2} - \frac{\I_{b}(x)}{N_b(x)} - \right] - \lesssim - \frac{(\log n)^2}{\lambda^2} - \frac{\lambda^{d}}{n} - \end{align*} - % - in the same manner. - Hence - % - \begin{align*} - \Var \left[ - \sum_{i=1}^{n} - \left( \mu(X_i) - \mu(x) \right) - \E \left[ - \frac{\I_{i b}(x)}{N_b(x)} - \Bigm| \bX - \right] - \right] - &\lesssim - \frac{(\log n)^2}{\lambda^2} - \frac{\lambda^{d}}{n}, - \end{align*} - % - and so by Chebyshev's inequality, - % - \begin{align*} - \big| - \E \left[ \hat \mu(x) \mid \bX, \bT \right] - - \E \left[ \hat \mu(x) \right] - \big| - &\lesssim_\P - \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} - \sqrt{ \frac{\lambda^{d}}{n} }. - \end{align*} - - \proofparagraph{computing the limiting bias} - - It remains to compute the limit of - $\E \left[ \hat \mu(x) \right] - \mu(x)$. - Let $\bX_{-i} = (X_1, \ldots, X_{i-1}, X_{i+1}, \ldots, X_n)$ - and $N_{-i b}(x) = \sum_{j=1}^n \I\{j \neq i\} \I\{X_j \in T_b(x)\}$. - Then - % - \begin{align*} - &\E \left[ \hat \mu(x) \right] - - \mu(x) - = - \E \left[ - \sum_{i=1}^{n} - \left( \mu(X_i) - \mu(x) \right) - \frac{\I_{i b}(x)}{N_b(x)} - \right] \\ - &\quad= - \sum_{i=1}^{n} - \E \left[ - \E \left[ - \frac{\left( \mu(X_i) - \mu(x) \right)\I_{i b}(x)} - {N_{-i b}(x) + 1} - \bigm| \bT, \bX_{-i} - \right] - \right] - = n \, - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {N_{-i b}(x) + 1} - \right]. - \end{align*} - % - By Lemma~\ref{lem:mondrian_app_binomial_expectation}, as - $N_{-i b}(x) \sim \Bin\left(n-1, - \int_{T_b(x)} f(s) \diff s \right)$ - given $\bT$ and $f$ is bounded below, - % - \begin{align*} - \left| - \E \! 
\left[ - \frac{1}{N_{-i b}(x) + 1} - \Bigm| \bT - \right] - - \frac{1}{(n-1) \! \int_{T_b(x)} \! f(s) \diff s + 1} - \right| - &\lesssim - \frac{1}{n^2 \! \left( \int_{T_b(x)} f(s) \diff s \right)^2} - \wedge 1 - \lesssim - \frac{1}{n^2 |T_b(x)|^2} - \wedge 1, - \end{align*} - % - and also - % - \begin{align*} - \left| - \frac{1}{(n-1) \int_{T_b(x)} f(s) \diff s + 1} - - \frac{1}{n \int_{T_b(x)} f(s) \diff s} - \right| - &\lesssim - \frac{1}{n^2 \left( \int_{T_b(x)} f(s) \diff s\right)^2} - \wedge 1 - \lesssim - \frac{1}{n^2 |T_b(x)|^2} - \wedge 1. - \end{align*} - % - So by Lemmas~\ref{lem:mondrian_app_largest_cell} - and \ref{lem:mondrian_app_moment_cell}, - since $f$ is Lipschitz and bounded, using Cauchy--Schwarz, - % - \begin{align*} - &\left| - \E \left[ \hat \mu(x) \right] - - \mu(x) - - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {\int_{T_b(x)} f(s) \diff s} - \right] - \right| - \lesssim - \E \left[ - \frac{n \int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} - {n^2 |T_b(x)|^2 \vee 1} - \right] \\ - &\qquad\lesssim - \E \left[ - \frac{\max_{1 \leq l \leq d} |T_b(x)_l| } - {n |T_b(x)| \vee 1} - \right] \\ - &\qquad\lesssim - \frac{2 \log n}{\lambda} \, - \E \left[ - \frac{1}{n |T_b(x)| \vee 1} - \right] - + \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| > - \frac{2 \log n}{\lambda} \right)^{1/2} - \E \left[ - \frac{1} - {n^2 |T_b(x)|^2 \vee 1} - \right]^{1/2} \\ - &\qquad\lesssim - \frac{\log n}{\lambda} \, - \frac{\lambda^d}{n} - + \frac{d}{n} - \frac{\lambda^d \sqrt{\log n}}{n} - \lesssim - \frac{\log n}{\lambda} \, - \frac{\lambda^d}{n}. - \end{align*} - % - Next set - $A = \frac{1}{f(x) |T_b(x)|} \int_{T_b(x)} (f(s) - f(x)) \diff s - \geq \inf_{s \in [0,1]^d} \frac{f(s)}{f(x)} - 1$. - Use the Maclaurin series of $\frac{1}{1+x}$ - up to order $\flbeta$ to see - $\frac{1}{1 + A} = \sum_{k=0}^{\flbeta} (-1)^k A^k - + O \left( A^{\flbeta + 1} \right)$. - Hence - % - \begin{align*} - &\E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {\int_{T_b(x)} f(s) \diff s} - \right] - = - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {f(x) |{T_b(x)}|} - \frac{1}{1 + A} - \right] \\ - &\quad= - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {f(x) |{T_b(x)}|} - \left( - \sum_{k=0}^{\flbeta} - (-1)^k - A^k - + O \left( |A|^{\flbeta + 1} \right) - \right) - \right]. - \end{align*} - % - Note that since $f$ and $\mu$ are Lipschitz, - and by integrating the tail probability given in - Lemma~\ref{lem:mondrian_app_largest_cell}, the Maclaurin remainder term is - bounded by - % - \begin{align*} - &\E \left[ - \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} - {f(x) |{T_b(x)}|} - |A|^{\flbeta + 1} - \right] \\ - &\qquad= - \E \left[ - \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} - {f(x) |{T_b(x)}|} - \left( - \frac{1}{f(x) |{T_b(x)}|} \int_{T_b(x)} (f(s) - f(x)) \diff s - \right)^{\flbeta + 1} - \right] \\ - &\qquad\lesssim - \E \left[ - \max_{1 \leq l \leq d} - |T_b(x)_l|^{\flbeta+2} - \right] - = - \int_{0}^{\infty} - \P \left( - \max_{1 \leq l \leq d} - |T_b(x)_l| - \geq t^{\frac{1}{\flbeta+2}} - \right) - \diff t - \leq - \int_{0}^{\infty} - 2 d e^{- \lambda t^{\frac{1}{\flbeta+2}} / 2} - \diff t \\ - &\qquad= - \frac{2^{\flbeta + 3} d (\flbeta + 2)! } - {\lambda^{\flbeta + 2}} - \lesssim - \frac{1}{\lambda^{\beta}}, - \end{align*} - % - since $\int_0^\infty e^{-a x^{1/k}} \diff x - = a^{-k} k!$. 
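  Indeed, substituting $u = x^{1/k}$ gives
  $\int_0^\infty e^{-a x^{1/k}} \diff x
  = k \int_0^\infty u^{k-1} e^{-a u} \diff u
  = k! \, a^{-k}$
  for any integer $k \geq 1$ and $a > 0$.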
- To summarize the progress so far, we have - % - \begin{align*} - &\left| - \E \left[ - \hat \mu(x) - \right] - - \mu(x) - - \sum_{k=0}^{\flbeta} - (-1)^k \, - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {f(x)^{k+1} |T_b(x)|^{k+1}} - \left( - \int_{T_b(x)} (f(s) - f(x)) \diff s - \right)^k - \right] - \right| \\ - &\qquad\lesssim - \frac{\log n}{\lambda} - \frac{\lambda^d}{n} - + \frac{1}{\lambda^\beta}. - \end{align*} - % - We evaluate the expectation. - By Taylor's theorem, with $\nu$ a multi-index, - as $f \in \cH^\beta$, - % - \begin{align*} - \left( - \int_{T_b(x)} (f(s) - f(x)) \diff s - \right)^k - &= - \left( - \sum_{|\nu| = 1}^\flbeta - \frac{\partial^\nu f(x)}{\nu !} - \! \int_{T_b(x)} - \!\! (s - x)^\nu - \diff s - \right)^k - + O \! \left( - |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta - \right). - \end{align*} - % - Next, by the multinomial theorem - with a multi-index $u$ indexed by $\nu$ with $|\nu| \geq 1$, - % - \begin{align*} - \left( - \sum_{|\nu| = 1}^\flbeta - \frac{\partial^\nu f(x)}{\nu !} - \int_{T_b(x)} - (s - x)^\nu - \diff s - \right)^k - &= - \sum_{|u| = k} - \binom{k}{u} - \left( - \frac{\partial^\nu f(x)}{\nu !} - \int_{T_b(x)} (s-x)^\nu \diff s - \right)^u - \end{align*} - % - where $\binom{k}{u}$ is a multinomial coefficient. - By Taylor's theorem with $f, \mu \in \cH^\beta$, - % - \begin{align*} - &\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s \\ - &\quad= - \sum_{|\nu'|=1}^{\flbeta} - \sum_{|\nu''|=0}^{\flbeta} - \frac{\partial^{\nu'} \mu(x)}{\nu' !} - \frac{\partial^{\nu''} f(x)}{\nu'' !} - \int_{T_b(x)} (s-x)^{\nu' + \nu''} \diff s - + O \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right). - \end{align*} - % - Now by integrating the tail probabilities in - Lemma~\ref{lem:mondrian_app_largest_cell}, - $ \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right] - \lesssim \frac{1}{\lambda^\beta}$. - Therefore, by Lemma~\ref{lem:mondrian_app_moment_cell}, - writing $T_b(x)^\nu$ for $\int_{T_b(x)} (s-x)^\nu \diff s$, - % - \begin{align*} - &\sum_{k=0}^{\flbeta} - (-1)^k \, - \E \left[ - \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} - {f(x)^{k+1} |T_b(x)|^{k+1}} - \left( - \int_{T_b(x)} (f(s) - f(x)) \diff s - \right)^k - \right] \\ - &\,= - \! \sum_{k=0}^{\flbeta} - (-1)^k \, - \E \! - \left[ - \! \frac{ - \sum_{|\nu'|=1}^{\flbeta} - \! \sum_{|\nu''|=0}^{\flbeta} - \! \frac{\partial^{\nu'} \mu(x)}{\nu' !} - \frac{\partial^{\nu''} f(x)}{\nu'' !} - T_b(x)^{\nu' + \nu''\!\!\!} - }{f(x)^{k+1} |T_b(x)|^{k+1}} - \!\! \sum_{|u| = k} - \! \binom{k}{u} - \!\! - \left( - \frac{\partial^\nu f(x)}{\nu !} - T_b(x)^\nu - \right)^{\!\! u} - \right] - \! + O \! \left( - \frac{1}{\lambda^\beta} - \right) \\ - &\,= - \sum_{|\nu'|=1}^{\flbeta} - \sum_{|\nu''|=0}^{\flbeta} - \sum_{|u|=0}^{\flbeta} - \frac{\partial^{\nu'} \mu(x)}{\nu' !} - \frac{\partial^{\nu''} f(x)}{\nu'' !} - \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u - \binom{|u|}{u} - \frac{(-1)^{|u|}}{f(x)^{|u|+1}} - \E \left[ - \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} - \right] \\ - &\quad+ - O \left( - \frac{1}{\lambda^\beta} - \right) . - \end{align*} - % - We show this is a polynomial in $1/\lambda$. - For $1 \leq j \leq d$, define - $E_{1j*} \sim \Exp(1) \wedge (\lambda x_j)$ - and $E_{2j*} \sim \Exp(1) \wedge (\lambda (1-x_j))$ - independent so - $T_b(x) = \prod_{j=1}^{d} [x_j - E_{1j*} / \lambda, x_j + E_{2j*} / \lambda]$. 
- Then - % - \begin{align*} - T_b(x)^\nu - &= - \int_{T_b(x)} (s-x)^\nu \diff s - = \prod_{j=1}^d - \int_{x_j - E_{1j*}/\lambda}^{x_j+E_{2j*}/\lambda} - (s - x_j)^{\nu_j} \diff s - = \prod_{j=1}^d - \int_{-E_{1j*}}^{E_{2j*}} (s / \lambda)^{\nu_j} 1/\lambda \diff s \\ - &= - \lambda^{-d - |\nu|} - \prod_{j=1}^d - \int_{-E_{1j*}}^{E_{2j*}} s^{\nu_j} \diff s - = \lambda^{-d - |\nu|} - \prod_{j=1}^d - \frac{E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}} - {\nu_j + 1}. - \end{align*} - % - So by independence over $j$, - % - \begin{align} - \label{eq:mondrian_app_bias_calc} - &\E \left[ - \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} - \right] \\ - \nonumber - &\quad= - \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} - \prod_{j=1}^d - \E \left[ - \frac{E_{2j*}^{\nu'_j + \nu''_j + 1} - + (-1)^{\nu'_j + \nu''_j} E_{1j*}^{\nu'_j + \nu''_j + 1}} - {(\nu'_j + \nu''_j + 1) (E_{2j*} + E_{1j*})} - \frac{\left(E_{2j*}^{\nu_j + 1} - + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}\right)^u} - {(\nu_j + 1)^u (E_{2j*} + E_{1j*})^{|u|}} - \right]. - \end{align} - % - The final step is to replace $E_{1j*}$ - by $E_{1j} \sim \Exp(1)$ and similarly for $E_{2j*}$. - For some $C > 0$, - % - \begin{align*} - \P \! \left( - \bigcup_{j=1}^{d} - \left( - \left\{ - E_{1j*} \neq E_{1j} - \right\} - \cup - \left\{ - E_{2j*} \neq E_{2j} - \right\} - \right) - \! \right) - &\leq - 2d\, - \P \! \left( - \Exp(1) \geq \lambda \min_{1 \leq j \leq d} - (x_j \wedge (1-x_j)) - \! \right) - \leq - 2d e^{-C \lambda}. - \end{align*} - % - Further, the quantity inside the expectation in - \eqref{eq:mondrian_app_bias_calc} - is bounded almost surely by one and so - the error incurred by replacing - $E_{1j*}$ and $E_{2j*}$ by $E_{1j}$ and $E_{2j}$ - in \eqref{eq:mondrian_app_bias_calc} - is at most $2 d e^{-C \lambda} \lesssim \lambda^{-\beta}$. - Thus the limiting bias is - % - \begin{align} - \nonumber - &\E \left[ \hat \mu(x) \right] - - \mu(x) \\ - \nonumber - &\quad= - \sum_{|\nu'|=1}^{\flbeta} - \sum_{|\nu''|=0}^{\flbeta} - \sum_{|u|=0}^{\flbeta} - \frac{\partial^{\nu'} \mu(x)}{\nu' !} - \frac{\partial^{\nu''} f(x)}{\nu'' !} - \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u - \binom{|u|}{u} - \frac{(-1)^{|u|}}{f(x)^{|u|+1}} - \, \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \\ - \nonumber - &\qquad\quad\times - \prod_{j=1}^d - \E \left[ - \frac{E_{2j}^{\nu'_j + \nu''_j + 1} - + (-1)^{\nu'_j + \nu''_j} E_{1j}^{\nu'_j + \nu''_j + 1}} - {(\nu'_j + \nu''_j + 1) (E_{2j} + E_{1j})} - \frac{\left(E_{2j}^{\nu_j + 1} - + (-1)^{\nu_j} E_{1j}^{\nu_j + 1}\right)^u} - {(\nu_j + 1)^u (E_{2j} + E_{1j})^{|u|}} - \right] \\ - \label{eq:mondrian_app_bias} - &\qquad+ - O \left( \frac{\log n}{\lambda} \frac{\lambda^d}{n} \right) - + O \left( \frac{1}{\lambda^\beta} \right), - \end{align} - % - recalling that $u$ is a multi-index which is indexed by the multi-index $\nu$. - This is a polynomial in $\lambda$ of degree at most $\flbeta$, - since higher-order terms can be absorbed into $O(1 / \lambda^\beta)$, - which has finite coefficients depending only on - the derivatives up to order $\flbeta$ of $f$ and $\mu$ at $x$. - Now we show that the odd-degree terms in this polynomial are all zero. - Note that a term is of odd degree if and only if - $|\nu'| + |\nu''| + |\nu| \cdot u$ is odd. - This implies that there exists $1 \leq j \leq d$ such that - exactly one of either - $\nu'_j + \nu''_j$ is odd or - $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd. 
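    Indeed,
    $\sum_{j=1}^{d} \big( \nu'_j + \nu''_j
    + \sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu \big)
    = |\nu'| + |\nu''| + |\nu| \cdot u$
    is odd, so at least one coordinate $j$ contributes an odd summand,
    and a sum of two non-negative integers is odd precisely when
    exactly one of them is odd.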
- - If $\nu'_j + \nu''_j$ is odd, then - $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is even, so - $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is even. - Consider the effect of swapping $E_{1j}$ and $E_{2j}$, - an operation which preserves their joint law, in each of - % - \begin{align} - \label{eq:mondrian_app_bias_odd_1} - \frac{E_{2j}^{\nu'_j + \nu''_j + 1} - - (-E_{1j})^{\nu'_j + \nu''_j + 1}} - {E_{2j} + E_{1j}} - \end{align} - % - and - % - \begin{align} - \label{eq:mondrian_app_bias_odd_2} - &\frac{\left(E_{2j}^{\nu_j + 1} - - (-E_{1j})^{\nu_j + 1}\right)^u} - {(E_{2j} + E_{1j})^{|u|}} - = \!\!\! - \prod_{\substack{|\nu| = 1 \\ - \nu_j u_\nu \text{ even}}}^\beta - \!\!\! - \frac{\left(E_{2j}^{\nu_j + 1} - - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} - {(E_{2j} + E_{1j})^{u_\nu}} - \!\!\! - \prod_{\substack{|\nu| = 1 \\ - \nu_j u_\nu \text{ odd}}}^\beta - \!\!\! - \frac{\left(E_{2j}^{\nu_j + 1} - - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} - {(E_{2j} + E_{1j})^{u_\nu}}. - \end{align} - % - Clearly, $\nu'_j + \nu''_j$ being odd inverts the - sign of \eqref{eq:mondrian_app_bias_odd_1}. - For \eqref{eq:mondrian_app_bias_odd_2}, - each term in the first product has either - $\nu_j$ even or $u_\nu$ even, so its sign is preserved. - Every term in the second product of \eqref{eq:mondrian_app_bias_odd_2} - has its sign inverted due to both $\nu_j$ and $u_\nu$ being odd, - but there are an even number of terms, - preserving the overall sign. - Therefore the expected product - of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} - is zero by symmetry. - - If however $\nu'_j + \nu''_j$ is even, then - $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd so - $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is odd. - Clearly, the sign of \eqref{eq:mondrian_app_bias_odd_1} is preserved. - Again the sign of the first product in \eqref{eq:mondrian_app_bias_odd_2} - is preserved, and the sign of every term in \eqref{eq:mondrian_app_bias_odd_2} - is inverted. However there are now an odd number of terms in the - second product, so its overall sign is inverted. - Therefore the expected product - of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} - is again zero. - - \proofparagraph{calculating the second-order bias} - - Next we calculate some special cases, beginning with - the form of the leading second-order bias, - where the exponent in $\lambda$ is - $|\nu'| + |\nu''| + u \cdot |\nu| = 2$, - proceeding by cases on the values of $|\nu'|$, $|\nu''|$, and $|u|$. - Firstly, if $|\nu'| = 2$ then $|\nu''| = |u| = 0$. - Note that if any $\nu'_j = 1$ then the expectation in - \eqref{eq:mondrian_app_bias} is zero. - Hence we can assume $\nu'_j \in \{0, 2\}$, yielding - % - \begin{align*} - \frac{1}{2 \lambda^2} - \! \sum_{j=1}^d - \frac{\partial^2 \mu(x)}{\partial x_j^2} - \frac{1}{3} - \E \! \left[ - \frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} - \right] - &\!= - \frac{1}{2 \lambda^2} - \! \sum_{j=1}^d - \frac{\partial^2 \mu(x)}{\partial x_j^2} - \frac{1}{3} - \E \! \left[ - E_{1j}^{2} - + E_{2j}^{2} - - E_{1j} E_{2j} - \right] - = \frac{1}{2 \lambda^2} - \! \sum_{j=1}^d - \frac{\partial^2 \mu(x)}{\partial x_j^2}, - \end{align*} - % - where we used that $E_{1j}$ and $E_{2j}$ are independent $\Exp(1)$. - Next we consider $|\nu'| = 1$ and $|\nu''| = 1$, so $|u| = 0$. - Note that if $\nu'_j = \nu''_{j'} = 1$ with $j \neq j'$ then the - expectation in \eqref{eq:mondrian_app_bias} is zero. 
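  Indeed, in that case the factor corresponding to coordinate $j$ is
  $\E \big[ (E_{2j}^{2} - E_{1j}^{2}) / (2 (E_{2j} + E_{1j})) \big]
  = \frac{1}{2} \E \left[ E_{2j} - E_{1j} \right] = 0$.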
- So we need only consider $\nu'_j = \nu''_j = 1$, giving - % - \begin{align*} - \frac{1}{\lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j} - \frac{1}{3} - \E \left[ - \frac{E_{2j}^{3} + E_{1j}^{3}} - {E_{2j} + E_{1j}} - \right] - &= - \frac{1}{\lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j}. - \end{align*} - % - Finally, we have the case where $|\nu'| = 1$, $|\nu''| = 0$ - and $|u|=1$. Then $u_\nu = 1$ for some $|\nu| = 1$ and zero otherwise. - Note that if $\nu'_j = \nu_{j'} = 1$ with $j \neq j'$ then the - expectation is zero. So we need only consider $\nu'_j = \nu_j = 1$, giving - % - \begin{align*} - &- \frac{1}{\lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j} - \frac{1}{4} - \E \left[ - \frac{(E_{2j}^2 - E_{1j}^2)^2} - {(E_{2j} + E_{1j})^2} - \right] \\ - &\quad= - - \frac{1}{4 \lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j} - \E \left[ - E_{1j}^2 - + E_{2j}^2 - - 2 E_{1j} E_{2j} - \right] - = - - \frac{1}{2 \lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j}. - \end{align*} - % - Hence the second-order bias term is - % - \begin{align*} - \frac{1}{2 \lambda^2} - \sum_{j=1}^d - \frac{\partial^2 \mu(x)}{\partial x_j^2} - + \frac{1}{2 \lambda^2} - \frac{1}{f(x)} - \sum_{j=1}^{d} - \frac{\partial \mu(x)}{\partial x_j} - \frac{\partial f(x)}{\partial x_j}. - \end{align*} - - \proofparagraph{calculating the bias if the data is uniformly distributed} - - If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$ and - the bias expansion from \eqref{eq:mondrian_app_bias} becomes - % - \begin{align*} - \sum_{|\nu'|=1}^{\flbeta} - \lambda^{- |\nu'|} - \frac{\partial^{\nu'} \mu(x)}{\nu' !} - \prod_{j=1}^d - \E \left[ - \frac{E_{2j}^{\nu'_j + 1} - + (-1)^{\nu'_j} E_{1j}^{\nu'_j + 1}} - {(\nu'_j + 1) (E_{2j} + E_{1j})} - \right]. - \end{align*} - % - This is zero if any $\nu_j'$ is odd, - so we group these terms based on the exponent of $\lambda$ to see - % - \begin{align*} - \frac{B_r(x)}{\lambda^{2r}} - &= - \frac{1}{\lambda^{2r}} - \sum_{|\nu|=r} - \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} - \prod_{j=1}^d - \frac{1}{2\nu_j + 1} - \E \left[ - \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} - {E_{2j} + E_{1j}} - \right]. - \end{align*} - % - Since $\int_0^\infty \frac{e^{-t}}{a+t} \diff t = e^a \Gamma(0,a)$ - and $\int_0^\infty s^a \Gamma(0, a) \diff s = \frac{a!}{a+1}$, - with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$ - the upper incomplete gamma function, - the expectation is easily calculated as - % - \begin{align*} - \E \left[ - \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} - {E_{2j} + E_{1j}} - \right] - &= - 2 - \int_{0}^{\infty} - s^{2\nu_j + 1} - e^{-s} - \int_{0}^{\infty} - \frac{e^{-t}} - {s + t} - \diff t - \diff s \\ - &= - 2 \int_{0}^{\infty} - s^{2\nu_j + 1} - \Gamma(0, s) - \diff s - = - \frac{(2 \nu_j + 1)!}{\nu_j + 1}, - \end{align*} - % - so finally - % - \begin{align*} - \frac{B_r(x)}{\lambda^{2r}} - &= - \frac{1}{\lambda^{2r}} - \sum_{|\nu|=r} - \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} - \prod_{j=1}^d - \frac{1}{2\nu_j + 1} - \frac{(2 \nu_j + 1)!}{\nu_j + 1} - = - \frac{1}{\lambda^{2r}} - \sum_{|\nu|=r} - \partial^{2 \nu} \mu(x) - \prod_{j=1}^d - \frac{1}{\nu_j + 1}. 
- \end{align*} - % -\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation}] - This follows from the debiased version in - Theorem~\ref{thm:mondrian_variance_estimation_debiased} - with $J=0$, $a_0 = 1$, and $\omega_0 = 1$. -\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_confidence}] - % - By Theorem~\ref{thm:mondrian_bias} - and Theorem~\ref{thm:mondrian_variance_estimation}, - % - \begin{align*} - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu(x) - \mu(x)}{\hat \Sigma(x)^{1/2}} - &= - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} - {\hat \Sigma(x)^{1/2}} - + \sqrt{\frac{n}{\lambda^d}} - \frac{\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x)} - {\hat \Sigma(x)^{1/2}} \\ - &= - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} - {\hat \Sigma(x)^{1/2}} - + \sqrt{\frac{n}{\lambda^d}} \, - O_\P \left( - \frac{1}{\lambda^{\beta \wedge 2}} - + \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} - \right). - \end{align*} - % - The first term now converges weakly to $\cN(0,1)$ by - Slutsky's theorem, Theorem~\ref{thm:mondrian_clt}, - and Theorem~\ref{thm:mondrian_variance_estimation}, - while the second term is $o_\P(1)$ by assumption. - Validity of the confidence interval follows immediately. - % -\end{proof} - -\subsection{Debiased Mondrian random forests} - -We give rigorous proofs of the central limit theorem, -bias characterization, variance estimation, -confidence interval validity, and minimax optimality -results for the debiased Mondrian random forest estimator. - -\begin{proof}[Theorem~\ref{thm:mondrian_clt_debiased}] - - We use the martingale central limit theorem given by - \citet[Theorem~3.2]{hall1980martingale}. - For each $1 \leq i \leq n$ define - $\cH_{n i}$ to be the filtration - generated by $\bT$, $\bX$, and - $(\varepsilon_j : 1 \leq j \leq i)$, - noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ - because $B$ increases weakly as $n$ increases. - Let $\I_{i b r}(x) = \I\{X_i \in T_{b r}(x)\}$ - where $T_{b r}(x)$ is the cell containing $x$ in tree $b$ - used to construct $\hat \mu_r(x)$, - and similarly let $N_{b r}(x) = \sum_{i=1}^n \I_{i b r}(x)$ - and $\I_{b r}(x) = \I\{N_{b r}(x) \geq 1\}$. - Define the $\cH_{n i}$-measurable and square integrable - variables - % - \begin{align*} - S_i(x) - &= - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - \omega_r - \frac{1}{B} \sum_{b=1}^B - \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)}, - \end{align*} - % - which satisfy the martingale - difference property - $\E [ S_i(x) \mid \cH_{n i} ] = 0$. - Further, - % - \begin{align*} - \sqrt{\frac{n}{\lambda^d}} - \big( - \hat\mu_\rd(x) - - \E\left[ - \hat\mu_\rd(x) \mid \bX, \bT - \right] - \big) - = \sum_{i=1}^n S_i(x). - \end{align*} - % - By \citet[Theorem~3.2]{hall1980martingale} - it suffices to check that - % - \begin{inlineroman} - \item $\max_i |S_i(x)| \to 0$ in probability,% - \label{it:mondrian_app_hall_prob} - \item $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and% - \label{it:mondrian_app_hall_exp} - \item $\sum_i S_i(x)^2 \to \Sigma_\rd(x)$ in probability. 
- \label{it:mondrian_app_hall_var} - \end{inlineroman} - - \proofparagraph{checking condition \ref{it:mondrian_app_hall_prob}} - % - Since $J$ is fixed and - $\E[|\varepsilon_i|^3 \mid X_i]$ is bounded, - by Jensen's inequality and - Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, - % - \begin{align*} - \E\left[\max_{1 \leq i \leq n} |S_i(x)| \right] - &= - \E\left[\max_{1 \leq i \leq n} - \left| - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - \omega_r - \frac{1}{B} \sum_{b=1}^B - \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} - \right| - \right] \\ - &\leq - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - |\omega_r| - \frac{1}{B} - \E\left[\max_{1 \leq i \leq n} - \left| - \sum_{b=1}^B - \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} - \right| - \right] \\ - &\leq - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - |\omega_r| - \frac{1}{B} - \E\left[ - \sum_{i=1}^{n} - \left( - \sum_{b=1}^B - \frac{\I_{i b r}(x) |\varepsilon_i|} {N_{b r}(x)} - \right)^3 - \right]^{1/3} \\ - &= - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - |\omega_r| - \frac{1}{B} - \E\left[ - \sum_{i=1}^{n} - |\varepsilon_i|^3 - \sum_{b=1}^B - \sum_{b'=1}^B - \sum_{b''=1}^B - \frac{\I_{i b r}(x) } {N_{b r}(x)} - \frac{\I_{i b' r}(x) } {N_{b' r}(x)} - \frac{\I_{i b'' r}(x) } {N_{b'' r}(x)} - \right]^{1/3} \\ - &\lesssim - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - |\omega_r| - \frac{1}{B^{2/3}} - \E\left[ - \sum_{b=1}^B - \sum_{b'=1}^B - \frac{\I_{b r}(x)} {N_{b r}(x)} - \frac{\I_{b' r}(x)} {N_{b' r}(x)} - \right]^{1/3} \\ - &\lesssim - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - |\omega_r| - \frac{1}{B^{2/3}} - \left( - B^2 \frac{a_r^{2d} \lambda^{2d}}{n^2} - + B \frac{a_r^{2d} \lambda^{2d} \log n}{n^2} - \right)^{1/3} \\ - &\lesssim - \left( \frac{\lambda^d}{n} \right)^{1/6} - + \left( \frac{\lambda^d}{n} \right)^{1/6} - \left( \frac{\log n}{B} \right)^{1/3} - \to 0. - \end{align*} - - \proofparagraph{checking condition \ref{it:mondrian_app_hall_exp}} - % - Since $\E[\varepsilon_i^2 \mid X_i]$ is bounded - and by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, - % - \begin{align*} - \E\left[\max_{1 \leq i \leq n} S_i(x)^2 \right] - &= - \E\left[ - \max_{1 \leq i \leq n} - \left( - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - \omega_r - \frac{1}{B} \sum_{b=1}^B - \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} - \right)^2 - \right] \\ - &\leq - \frac{n}{\lambda^d} - \frac{1}{B^2} - (J+1)^2 - \max_{0 \leq r \leq J} - \omega_r^2 - \,\E\left[ - \sum_{i=1}^{n} - \sum_{b=1}^B - \sum_{b'=1}^B - \frac{\I_{i b r}(x) \I_{i b' r}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r}(x)} - \right] \\ - &\lesssim - \frac{n}{\lambda^d} - \max_{0 \leq r \leq J} - \E\left[ - \frac{\I_{b r}(x)}{N_{b r}(x)} - \right] - \lesssim - \frac{n}{\lambda^d} - \max_{0 \leq r \leq J} - \frac{a_r^d \lambda^d}{n} - \lesssim 1. 
- \end{align*} - - \proofparagraph{checking condition \ref{it:mondrian_app_hall_var}} - - Next, we have - % - \begin{align} - \label{eq:mondrian_app_clt_condition_sum} - \sum_{i=1}^n - S_i(x)^2 - &= - \sum_{i=1}^n - \left( - \sqrt{\frac{n}{\lambda^d}} - \sum_{r=0}^{J} - \omega_r - \frac{1}{B} \sum_{b=1}^B - \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} - \right)^2 \\ - &= - \nonumber - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \sum_{b=1}^B - \sum_{b'=1}^B - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} \\ - \nonumber - &= - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \sum_{b=1}^B - \left( - \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b r'}(x)} - + \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right). - \end{align} - % - By boundedness of $\E[\varepsilon_i^2 \mid X_i]$ - and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, - the first term in \eqref{eq:mondrian_app_clt_condition_sum} - vanishes as - % - \begin{align*} - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \sum_{b=1}^B - \E \left[ - \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b r'}(x)} - \right] - &\lesssim - \frac{n}{\lambda^d} - \frac{1}{B^2} - \max_{0 \leq r \leq J} - \sum_{b=1}^B - \E \left[ - \frac{\I_{b r}(x)}{N_{b r}(x)} - \right] - \lesssim - \frac{1}{B} - \to 0. - \end{align*} - % - For the second term in \eqref{eq:mondrian_app_clt_condition_sum}, - the law of total variance gives - % - \begin{align} - \nonumber - &\Var \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - \nonumber - &\quad\leq - (J+1)^4 - \max_{0 \leq r, r' \leq J} - \omega_r - \omega_{r'} - \Var \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - \nonumber - &\quad\lesssim - \max_{0 \leq r, r' \leq J} - \E \left[ - \Var \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - \label{eq:mondrian_app_total_variance} - &\qquad+ - \max_{0 \leq r, r' \leq J} - \Var \left[ - \E \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] - \end{align} - % - For the first term in \eqref{eq:mondrian_app_total_variance}, - % - \begin{align*} - &\E \left[ - \Var \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - &\quad= - \frac{n^2}{\lambda^{2d}} - \frac{1}{B^4} - \sum_{i=1}^n - \sum_{j=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \sum_{\tilde b=1}^B - \sum_{\tilde b' \neq \tilde b} - \E \Bigg[ - \varepsilon_i^2 - \varepsilon_j^2 - \left( - \frac{\I_{i b r}(x) \I_{i b' r'}(x) } - {N_{b r}(x) N_{b' r'}(x)} - 
- \E - \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) } - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX - \right] - \right) \\ - &\qquad\quad - \times - \left( - \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } - {N_{\tilde b r}(x) N_{ \tilde b' r'}(x)} - - \E - \left[ - \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } - {N_{\tilde b r}(x) N_{\tilde b' r'}(x)} - \Bigm| \bX - \right] - \right) - \Bigg]. - \end{align*} - % - Since $T_{b r}$ is independent of $T_{b' r'}$ given - $\bX, \bY$, the summands are zero - whenever $\big|\{b, b', \tilde b, \tilde b'\}\big| = 4$. - Since $\E[ \varepsilon_i^2 \mid X_i]$ is bounded - and by the Cauchy--Schwarz inequality - and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, - % - \begin{align*} - &\E \left[ - \Var \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - &\quad\lesssim - \frac{n^2}{\lambda^{2d}} - \frac{1}{B^3} - \sum_{b=1}^B - \sum_{b' \neq b} - \E \left[ - \left( - \sum_{i=1}^n - \frac{\I_{i b r}(x) \I_{i b' r'}(x) } - {N_{b r}(x) N_{b' r'}(x)} - \right)^2 - \right] - \lesssim - \frac{n^2}{\lambda^{2d}} - \frac{1}{B} - \E \left[ - \frac{\I_{b r}(x)}{N_{b r}(x)} - \frac{\I_{b' r'}(x)}{N_{b' r'}(x)} - \right] - \lesssim - \frac{1}{B} - \to 0. - \end{align*} - % - For the second term in \eqref{eq:mondrian_app_total_variance}, - the random variable inside the variance is a nonlinear - function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, - so we apply the Efron--Stein inequality - \citep{efron1981jackknife}. - Let $(\tilde X_{i j}, \tilde Y_{i j}) = (X_i, Y_i)$ - if $i \neq j$ and be an - independent copy of $(X_j, Y_j)$, - denoted $(\tilde X_j, \tilde Y_j)$, if $i = j$, - and define $\tilde \varepsilon_{i j} = \tilde Y_{i j} - \mu(\tilde X_{i j})$. - Write - $\tilde \I_{i j b r}(x) = \I \big\{ \tilde X_{i j} \in T_{b r}(x) \big\}$ - and - $\tilde \I_{j b r}(x) = \I \big\{ \tilde X_{j} \in T_{b r}(x) \big\}$, - and also - $\tilde N_{j b r}(x) = \sum_{i=1}^{n} \tilde \I_{i j b r}(x)$. - We use the leave-one-out notation - $N_{-j b r}(x) = \sum_{i \neq j} \I_{i b r}(x)$ - and also write - $N_{-j b r \cap b' r'} = \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x)$. 
- Since $\E[ \varepsilon_i^4 \mid X_i]$ is bounded, - % - \begin{align*} - &\Var \left[ - \E \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - &\quad\leq - \Var \left[ - \E \left[ - \frac{n}{\lambda^d} - \sum_{i=1}^n - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - &\quad\leq - \frac{1}{2} - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - \left( - \sum_{i=1}^n - \left( - \frac{\I_{i b r}(x) \I_{i b' r}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - - \frac{\tilde \I_{i j b r}(x) \tilde \I_{i j b' r'}(x) - \tilde \varepsilon_{i j}^2} - {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} - \right) - \right)^2 - \right] \\ - &\quad\leq - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - \left( - \left| - \frac{1} - {N_{b }(x) N_{b' r'}(x)} - - \frac{1} - {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} - \right| - \sum_{i \neq j} - \I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2 - \right)^2 - \right] \\ - &\qquad+ - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - \left( - \left( - \frac{\I_{j b r}(x) \I_{j b' r'}(x) \varepsilon_j^2} - {N_{b r}(x) N_{b' r'}(x)} - - \frac{\tilde \I_{j b r}(x) \tilde \I_{j b' r'}(x) - \tilde \varepsilon_j^2} - {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} - \right) - \right)^2 - \right] \\ - &\quad\lesssim - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - N_{-j b r \cap b' r}(x)^2 - \left| - \frac{1} - {N_{b r}(x) N_{b' r'}(x)} - - \frac{1} - {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} - \right|^2 - + \frac{\I_{j b r}(x) \I_{j b' r'}(x)} - {N_{b r}(x)^2 N_{b' r'}(x)^2} - \right]. - \end{align*} - % - For the first term in the above display, note that - % - \begin{align*} - &\left| - \frac{1}{N_{b r}(x) N_{b' r'}(x)} - - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} - \right| \\ - &\quad\leq - \frac{1}{N_{b r}(x)} - \left| - \frac{1} {N_{b' r'}(x)} - \frac{1} {\tilde N_{j b' r'}(x)} - \right| - + \frac{1}{\tilde N_{j b' r'}(x)} - \left| - \frac{1} {N_{b r}(x)} - \frac{1} {\tilde N_{j b r}(x)} - \right| \\ - &\quad\leq - \frac{1}{N_{-j b r}(x)} - \frac{1} {N_{-j b' r'}(x)^2} - + \frac{1}{N_{-j b' r'}(x)} - \frac{1} {N_{-j b r}(x)^2} - \end{align*} - % - since $|N_{b r}(x) - \tilde N_{j b r}(x)| \leq 1$ - and $|N_{b' r'}(x) - \tilde N_{j b' r'}(x)| \leq 1$. - Further, these terms are non-zero only on the events - $\{ X_j \in T_{b r}(x) \} \cup \{ \tilde X_j \in T_{b r}(x) \}$ - and $\{ X_j \in T_{b' r'}(x) \} \cup \{ \tilde X_j \in T_{b' r'}(x) \}$ - respectively, so - % - \begin{align*} - &\Var \left[ - \E \left[ - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{b=1}^B - \sum_{b' \neq b} - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \Bigm| \bX, \bY - \right] - \right] \\ - &\, \lesssim - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - \frac{\I_{j b' r'}(x) + \tilde \I_{j b' r'}(x)}{N_{-j b r}(x)^2} - \frac{N_{-j b r \cap b' r}(x)^2} {N_{-j b' r'}(x)^4} - \right. \\ - &\left. 
- \qquad+ - \frac{\I_{j b r}(x) + \tilde \I_{j b r}(x)}{N_{-j b' r'}(x)^2} - \frac{N_{-j b r \cap b' r}(x)^2} {N_{-j b r}(x)^4} - + - \frac{\I_{j b r}(x) \I_{j b' r'}(x)} - {N_{b r}(x)^2 N_{b' r'}(x)^2} - \right] \\ - &\, \lesssim - \frac{n^2}{\lambda^{2d}} - \sum_{j=1}^{n} - \E \left[ - \frac{\I_{j b r}(x) \I_{b r}(x) \I_{b' r'}(x)} - {N_{b r}(x)^2 N_{b' r'}(x)^2} - \right] - \lesssim - \frac{n^2}{\lambda^{2d}} - \E \left[ - \frac{\I_{b r}(x) \I_{b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)^2} - \right] \\ - &\lesssim - \frac{n^2}{\lambda^{2d}} - \frac{\lambda^d}{n} - \frac{\lambda^{2d} \log n}{n^2} - \lesssim - \frac{\lambda^d \log n}{n} - \to 0, - \end{align*} - % - where we used Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. - So - $\sum_{i=1}^{n} S_i(x)^2 - n \,\E \left[ S_i(x)^2 \right] - = O_\P \left( \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right) - = o_\P(1)$. - - \proofparagraph{calculating the limiting variance} - % - Thus by \citet[Theorem~3.2]{hall1980martingale} - we conclude that - % - \begin{align*} - \sqrt{\frac{n}{\lambda^d}} - \big( - \hat\mu_\rd(x) - - \E\left[ - \hat\mu_\rd(x) \mid \bX, \bT - \right] - \big) - &\rightsquigarrow - \cN\big(0, \Sigma_\rd(x)\big) - \end{align*} - % - as $n \to \infty$, assuming that the limit - % - \begin{align*} - \Sigma_\rd(x) - &= - \lim_{n \to \infty} - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - \end{align*} - % - exists. Now we verify this and calculate the limit. - Since $J$ is fixed, it suffices to find - % - \begin{align*} - \lim_{n \to \infty} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - \end{align*} - % - for each $0 \leq r, r' \leq J$. - Firstly, note that - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \sigma^2(X_i)} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - &= - \frac{n^2}{\lambda^d} - \sigma^2(x) - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - &\quad+ - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) - \big(\sigma^2(X_i) - \sigma^2(x) \big)} - {N_{b r}(x) N_{b' r'}(x)} - \right]. - \end{align*} - % - Since $\sigma^2$ is Lipschitz and - $\P \left(\max_{1 \leq l \leq d} - |T_b(x)_l| \geq t/\lambda \right) \leq 2d e^{-t/2}$ - by Lemma~\ref{lem:mondrian_app_largest_cell}, - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) - \big|\sigma^2(X_i) - \sigma^2(x) \big|} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &\leq - 2de^{-t/2} - \frac{n^2}{\lambda^d} - + \frac{n^2}{\lambda^d} - \frac{t}{\lambda} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - &\lesssim - \frac{n^2}{\lambda^d} - \frac{\log n}{\lambda} - \frac{\lambda^d}{n^2} - \lesssim - \frac{\log n}{\lambda}, - \end{align*} - % - by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, - where we set $t = 4 \log n$. 
- Therefore - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \sigma^2(x) - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)} - \right] - + O \left( \frac{\log n}{\lambda} \right). - \end{align*} - % - Next, by conditioning on - $T_{b r}$, $T_{b' r'}$, $N_{-i b r}(x)$, and $N_{-i b' r'}(x)$, - % - \begin{align*} - &\E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)} - \right] - = \E \left[ - \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} f(\xi) \diff \xi} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] \\ - &\quad= f(x) \, - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] - + - \E \left[ - \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} - (f(\xi) - f(x)) \diff \xi} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] \\ - &\quad= - f(x) \, - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] - + O \left( - \frac{\lambda^d}{n^2} - \frac{(\log n)^{d+1}}{\lambda} - \right) - \end{align*} - % - arguing using Lemma~\ref{lem:mondrian_app_largest_cell}, - the Lipschitz property of $f(x)$, - and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \! \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \sigma^2(x) - f(x) - \frac{n^2}{\lambda^d} - \E \! \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] - \! + O \! \left( - \frac{(\log n)^{d+1}}{\lambda} - \right). - \end{align*} - % - Now we apply the binomial result in - Lemma~\ref{lem:mondrian_app_binomial_expectation} - to approximate the expectation. With - $N_{-i b' r' \setminus b r}(x) = - \sum_{j \neq i} \I\{X_j \in T_{b' r'}(x) \setminus T_{b r}(x)\}$, - % - \begin{align*} - &\E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right] - = \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {N_{-i b r}(x)+1} - \right. \\ - &\qquad\left. - \times \, - \E \left[ - \frac{1} - {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} - \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) - \right] - \right]. - \end{align*} - % - Now conditional on - $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, - % - \begin{align*} - N_{-i b' r' \setminus b r}(x) - &\sim \Bin\left( - n - 1 - N_{-i b r}(x), \ - \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} - {1 - \int_{T_{b r}(x)} - f(\xi) \diff \xi} - \right). - \end{align*} - % - We bound these parameters above and below. - Firstly, by Lemma~\ref{lem:mondrian_app_active_data} with $B=1$, - % - \begin{align*} - \P \left( N_{-i b r}(x) > - t^{d+1} - \frac{n}{\lambda^d} - \right) - &\leq - 4 d e^{- t / (4 \|f\|_\infty(1 + 1/a_r))} - \leq - e^{- t / C} - \end{align*} - % - for some $C > 0$ and sufficiently large $t$. 
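    For instance, any fixed $C > 4 \|f\|_\infty (1 + 1/a_r)$ suffices,
    since the second inequality is then equivalent to
    $t \big( \frac{1}{4 \|f\|_\infty (1 + 1/a_r)} - \frac{1}{C} \big)
    \geq \log (4 d)$,
    which holds for all sufficiently large $t$.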
- Next, if $f$ is $L$-Lipschitz in $\ell^2$, - by Lemma~\ref{lem:mondrian_app_largest_cell}, - % - \begin{align*} - &\P \left( - \left| - \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} - {1 - \int_{T_{b r}(x)} f(\xi) - \diff \xi} - - f(x) |T_{b' r'}(x) \setminus T_{b r}(x)| - \right| - > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{\lambda} - \right) \\ - &\quad\leq - \P \left( - \int_{T_{b' r'}(x) \setminus T_{b r}(x)} - \left| f(\xi) - f(x) \right| - \diff \xi - > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2 \lambda} - \right) \\ - &\qquad+ - \P \left( - \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi - \cdot \int_{T_{b r}(x)} f(\xi) \diff \xi} - {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} - > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} - \right) \\ - &\quad\leq - \P \left( - L d\, - |T_{b' r'}(x) \setminus T_{b r}(x)| - \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| - > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} - \right) \\ - &\qquad+ - \P \left( - \|f\|_\infty - \,|T_{b' r'}(x) \setminus T_{b r}(x)| - \frac{\|f\|_\infty |T_{b r}(x)|} - {1 - \|f\|_\infty |T_{b r}(x)|} - > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} - \right) \\ - &\quad\leq - \P \left( - \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| - > \frac{t}{2\lambda L d} - \right) - +\P \left( - |T_{b r}(x)| - > \frac{t}{4\lambda \|f\|_\infty^2} - \right) \\ - &\quad\leq - 2 d e^{-t a_r /(4L d)} - + 2 d e^{-t a_r / (8 \|f\|_\infty^2)} - \leq e^{-t/C}, - \end{align*} - % - for large $t$, - increasing $C$ as necessary. - Thus with probability at least $1 - e^{-t/C}$, - increasing $C$, - % - \begin{align*} - N_{-i b' r' \setminus b r}(x) - &\leq \Bin\left( - n, \, - |T_{b' r'}(x) \setminus T_{b r}(x)| - \left( f(x) + \frac{t}{\lambda} \right) - \right) \\ - N_{-i b' r' \setminus b r}(x) - &\geq - \Bin\left( - n - \left( 1 - \frac{t^{d+1}}{\lambda^d} - - \frac{1}{n} \right), \, - |T_{b' r'}(x) \setminus T_{b r}(x)| - \left( f(x) - \frac{t}{\lambda} \right) - \right). - \end{align*} - % - So by Lemma~\ref{lem:mondrian_app_binomial_expectation} conditionally on - $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, - we have with probability at least $1 - e^{-t/C}$ that - % - \begin{align*} - &\left| - \E \left[ - \frac{1} - {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} - \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) - \right] - \right. - \\ - &\left. - \qquad- - \frac{1} - {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} - \right| \\ - &\quad\lesssim - \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} - {\left(N_{-i b' r' \cap b r}(x) - + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2}. - \end{align*} - % - Therefore, by the same approach as the proof of - Lemma~\ref{lem:mondrian_app_moment_denominator}, - taking $t = 3 C \log n$, - % - \begin{align*} - & - \left| - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} - \right.\right. \\ - &\left.\left. 
- \qquad - - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(N_{-i b r}(x)+1) - (N_{-i b' r' \cap b r}(x)+n f(x) - |T_{b' r'}(x) \setminus T_{b r}(x)|+1)} - \right] - \right| \\ - &\quad\lesssim - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} - \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} - {\left(N_{-i b' r' \cap b r}(x) - + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2} - \right] - + - e^{-t/C} \\ - &\quad\lesssim - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {n |T_{b r}(x)|+1} - \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} - {(n |T_{b' r'}(x)| + 1)^2} - \right] - + e^{-t/C} \\ - &\quad\lesssim - \E \left[ - \frac{1}{n} - \frac{1} - {(n |T_{b' r'}(x)| + 1)^2} - + \frac{1}{n} - \frac{t / \lambda} - {n |T_{b' r'}(x)| + 1} - \right] - + e^{-t/C} \\ - &\quad\lesssim - \frac{\lambda^{2d} \log n}{n^3} - + \frac{\log n}{n \lambda} - \frac{\lambda^d}{n} - \lesssim - \frac{\lambda^d}{n^2} - \left( - \frac{\lambda^{d} \log n}{n} - + \frac{\log n}{\lambda} - \right). - \end{align*} - % - Now apply the same argument to the other - term in the expectation, to see that - % - \begin{align*} - &\left| - \E \left[ - \frac{1} - {N_{-i b r \cap b' r'}(x)+N_{-i b r \setminus b' r'}(x)+1} - \Bigm| \bT, N_{-i b r \cap b' r'}(x), N_{-i b' r' \setminus b r}(x) - \right] - \right. \\ - &\left. - \qquad- - \frac{1} - {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} - \right| \\ - &\quad\lesssim - \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} - {\left(N_{-i b r \cap b' r'}(x) - + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2}. - \end{align*} - % - with probability at least $1 - e^{-t/C}$, - and so likewise again with $t = 3 C \log n$, - % - \begin{align*} - &\frac{n^2}{\lambda^d} - \left| - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} - \frac{1} - {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} - \right] - \right. - \\ - &\left. - \quad- - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} - \right.\right. \\ - &\qquad\qquad\left.\left. - \times - \frac{1} - {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} - \right] - \right| \\ - &\lesssim - \frac{n^2}{\lambda^d} \, - \E \left[ - \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} - {\left(N_{-i b r \cap b' r'}(x) - + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2} - \right. \\ - &\qquad\qquad\left. - \times - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} - \right] - + \frac{n^2}{\lambda^d} - e^{-t/C} \\ - &\lesssim - \frac{\lambda^d \log n}{n} - + \frac{\log n}{\lambda}. - \end{align*} - % - Thus far we have proven that - % - \begin{align*} - &\frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - = \sigma^2(x) - f(x) - \frac{n^2}{\lambda^d} \\ - &\quad\times - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} - \right. \\ - &\left. - \qquad\qquad - \times - \frac{1} - {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} - \right] \\ - &\quad+ - O \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{\lambda^d \log n}{n} - \right). - \end{align*} - % - We remove the $N_{-i b r \cap b' r'}(x)$ terms. 
- With probability at least $1 - e^{-t/C}$, conditional on $\bT$, - % - \begin{align*} - N_{-i b r \cap b' r'}(x) - &\leq \Bin\left( - n, \, - |T_{b r}(x) \cap T_{b' r'}(x)| - \left( f(x) + \frac{t}{\lambda} \right) - \right), \\ - N_{-i b r \cap b' r'}(x) - &\geq - \Bin\left( - n - \left( 1 - \frac{t^{d+1}}{\lambda^d} - - \frac{1}{n} \right), \, - |T_{b r}(x) \cap T_{b' r'}(x)| - \left( f(x) - \frac{t}{\lambda} \right) - \right). - \end{align*} - % - Therefore, by Lemma~\ref{lem:mondrian_app_binomial_expectation} - applied conditionally on $\bT$, - with probability at least $1 - e^{-t/C}$, - % - \begin{align*} - & - \left| - \E \! \left[ - \frac{1} - {N_{-i b r \cap b' r'}(x) - + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} - \frac{1} - {N_{-i b' r' \cap b r}(x) - + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} - \! \Bigm| \! \bT - \right] - \right. - \\ - &\left. - \qquad- - \frac{1} - {n f(x) |T_{b r}(x)|+1} - \frac{1} - {n f(x) |T_{b' r'}(x)|+1} - \right| \\ - &\quad\lesssim - \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} - {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} - \left( - \frac{1}{n |T_{b r}(x)| + 1} - + \frac{1}{n |T_{b' r'}(x)| + 1} - \right). - \end{align*} - % - Now by Lemma~\ref{lem:mondrian_app_moment_cell}, - with $t = 3 C \log n$, - % - \begin{align*} - &\frac{n^2}{\lambda^d} - \left| - \E \! \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {N_{-i b r \cap b' r'}(x) - + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} - \frac{1} - {N_{-i b' r' \cap b r}(x) - + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} - \right] - \right. \\ - &\left. - \qquad- - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {n f(x) |T_{b r}(x)|+1} - \frac{1} - {n f(x) |T_{b' r'}(x)|+1} - \right] - \right| \\ - &\quad\lesssim - \frac{n^2}{\lambda^d} - \E \left[ - |T_{b r}(x) \cap T_{b' r'}(x)| - \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} - {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} - \frac{1}{n |T_{b r}(x)| + 1} - + \frac{1}{n |T_{b' r'}(x)| + 1} - \right] \\ - &\qquad+ - \frac{n^2}{\lambda^d} - e^{-t/C} \\ - &\quad\lesssim - \frac{n^2}{\lambda^d} - \frac{1}{n^3} - \E \left[ - \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} - {|T_{b r}(x)| |T_{b' r'}(x)|} - \right] - + \frac{n^2}{\lambda^d} - e^{-t/C} \\ - &\quad\lesssim - \frac{1}{n \lambda^d} - \E \left[ - \frac{1}{|T_{b r}(x)| |T_{b' r'}(x)|} - \right] - + \frac{t}{\lambda^{d+1}} - \E \left[ - \frac{1}{|T_{b r}(x)|} - \right] - + \frac{n^2}{\lambda^d} - e^{-t/C} \\ - &\quad\lesssim - \frac{\lambda^d}{n} - + \frac{\log n}{\lambda}. - \end{align*} - % - This allows us to deduce that - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \sigma^2(x) - f(x) - \frac{n^2}{\lambda^d} - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {(n f(x) |T_{b r}(x)|+1)(n f(x) |T_{b' r'}(x)|+1)} - \right] \\ - &\quad+ - O \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{\lambda^d \log n}{n} - \right). - \end{align*} - % - Now that we have reduced the limiting variance to an expression - only involving the sizes of Mondrian cells, - we can exploit their exact distribution to compute this expectation. 
- Recall from \citet[Proposition~1]{mourtada2020minimax} - that we can write - % - \begin{align*} - |T_{b r}(x)| - &= \prod_{j=1}^{d} - \left( - \frac{E_{1j}}{a_r \lambda} \wedge x_j - + \frac{E_{2j}}{a_r \lambda} \wedge (1 - x_j) - \right), \\ - |T_{b' r'}(x)| - &= - \prod_{j=1}^{d} - \left( - \frac{E_{3j}}{a_{r'} \lambda} \wedge x_j - + \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) - \right), \\ - |T_{b r }(x)\cap T_{b' r'}(x)| - &= \prod_{j=1}^{d} - \left( - \frac{E_{1j}}{a_r \lambda} \wedge - \frac{E_{3j}}{a_{r'} \lambda} - \wedge x_j - + \frac{E_{2j}}{a_r \lambda} \wedge - \frac{E_{4j}}{a_{r'} \lambda} - \wedge (1 - x_j) - \right) - \end{align*} - % - where $E_{1j}$, $E_{2j}$, $E_{3j}$, and $E_{4j}$ - are independent and $\Exp(1)$. - Define their non-truncated versions - % - \begin{align*} - |\tilde T_{b r}(x)| - &= - a_r^{-d} - \lambda^{-d} - \prod_{j=1}^{d} - \left( E_{1j} + E_{2j} \right), \\ - |\tilde T_{b' r'}(x)| - &= - a_{r'}^{-d} - \lambda^{-d} - \prod_{j=1}^{d} - \left( E_{3j} + E_{4j} \right), \\ - |\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)| - &= - \lambda^{-d} - \prod_{j=1}^{d} - \left( - \frac{E_{1j}}{a_r} - \wedge - \frac{E_{3j}}{a_{r'}} - + \frac{E_{2j}}{a_r} - \wedge - \frac{E_{4j}}{a_{r'}} - \right), - \end{align*} - % - and note that - % - \begin{align*} - &\P \left( - \big( \tilde T_{b r}(x), \tilde T_{b' r'}(x), - \tilde T_{b r}(x) \cap T_{b' r'}(x) \big) - \neq - \big( T_{b r}(x), T_{b' r'}(x), T_{b r}(x) \cap T_{b' r'}(x) \big) - \right) \\ - &\,\leq - \sum_{j=1}^{d} - \big( - \P(E_{1j} \geq a_r \lambda x_j) - + \P(E_{3j} \geq a_{r'} \lambda x_j) - + \P(E_{2j} \geq a_r \lambda (1 - x_j)) - + \P(E_{4j} \geq a_{r'} \lambda (1 - x_j)) - \big) \\ - &\,\leq e^{-C \lambda} - \end{align*} - % - for some $C > 0$ and sufficiently large $\lambda$. - So by Cauchy--Schwarz and Lemma~\ref{lem:mondrian_app_moment_cell}, - % - \begin{align*} - & - \frac{n^2}{\lambda^d} - \left| - \E \left[ - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} - {n f(x) |T_{b r}(x)|+1} - \frac{1} - {n f(x) |T_{b' r'}(x)|+1} - \right] - - \E \left[ - \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} - {n f(x) |\tilde T_{b r}(x)|+1} - \frac{1} - {n f(x) |\tilde T_{b' r'}(x)|+1} - \right] - \right| \\ - &\quad\lesssim - \frac{n^2}{\lambda^d} - e^{-C \lambda} - \lesssim - e^{-C \lambda / 2} - \end{align*} - % - as $\log \lambda \gtrsim \log n$. - Therefore - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \sigma^2(x) - f(x) - \frac{n^2}{\lambda^d} - \E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} - \right] \\ - &\quad+ - O \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{\lambda^d \log n}{n} - \right). - \end{align*} - % - We remove the superfluous units in the denominators. 
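    Heuristically these units are negligible because the denominators are
    of order $n / \lambda^d$ on average:
    Lemma~\ref{lem:mondrian_app_moment_cell}
    yields inverse moment bounds of the form
    %
    \begin{align*}
        \E \left[ \frac{1}{|\tilde T_{b r}(x)|} \right]
        \lesssim \lambda^d,
    \end{align*}
    %
    so dropping a unit from a denominator of size roughly
    $n f(x) |\tilde T_{b r}(x)|$ should cost only terms of order
    $\lambda^d / n$. The next two displays verify this,
    using the independence of the trees to handle the product structure.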
- Firstly, by independence of the trees, - % - \begin{align*} - & \frac{n^2}{\lambda^d} - \left| - \E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} - \right] - - \E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} - \right] - \right| \\ - &\quad\lesssim - \frac{n^2}{\lambda^d} - \E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {n |\tilde T_{b r}(x)|} - \frac{1} - {n^2 |\tilde T_{b' r'}(x)|^2} - \right] - \lesssim - \frac{1}{n \lambda^d} - \E \left[ - \frac{1}{|T_{b r}(x)|} - \right] - \E \left[ - \frac{1}{|T_{b' r'}(x)|} - \right] - \lesssim - \frac{\lambda^d}{n}. - \end{align*} - % - Secondly, we have in exactly the same manner that - % - \begin{align*} - \frac{n^2}{\lambda^d} - \left| - \E \left[ - \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} - {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} - \right] - - \E \left[ - \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} - {n^2 f(x)^2 |\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} - \right] - \right| - &\lesssim - \frac{\lambda^d}{n}. - \end{align*} - % - Therefore - % - \begin{align*} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] - &= - \frac{\sigma^2(x)}{f(x)} - \frac{1}{\lambda^d} - \E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} - \right] - + O \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{\lambda^d \log n}{n} - \right). - \end{align*} - % - It remains to compute this integral. - By independence over $1 \leq j \leq d$, - % - \begin{align*} - &\E \left[ - \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} - {|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} - \right] \\ - &\quad= - a_r^d a_{r'}^d \lambda^d - \prod_{j=1}^d - \E \left[ - \frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'}) - + (E_{2j} a_r) \wedge (E_{4j} / a_{r'}) } - { \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right)} - \right] \\ - &\quad= - 2^d a_r^d a_{r'}^d \lambda^d - \prod_{j=1}^d - \E \left[ - \frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'})} - { \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right) } - \right] \\ - &\quad= - 2^d a_r^d a_{r'}^d \lambda^d - \prod_{j=1}^d - \int_{0}^{\infty} - \int_{0}^{\infty} - \int_{0}^{\infty} - \int_{0}^{\infty} - \frac{ (t_1 / a_r) \wedge (t_3 / a_{r'}) } - { \left( t_1 + t_2 \right) \left( t_3 + t_4 \right) } - e^{-t_1 - t_2 - t_3 - t_4} - \diff t_1 - \diff t_2 - \diff t_3 - \diff t_4 \\ - &\quad= - 2^d a_r^d a_{r'}^d \lambda^d - \prod_{j=1}^d - \int_{0}^{\infty} - \int_{0}^{\infty} - ((t_1 / a_r) \wedge (t_3 / a_{r'})) - e^{-t_1 - t_3} \\ - &\qquad\times - \left( - \int_{0}^{\infty} - \frac{e^{-t_2}}{t_1 + t_2} - \diff t_2 - \right) - \left( - \int_{0}^{\infty} - \frac{e^{-t_4}}{t_3 + t_4} - \diff t_4 - \right) - \diff t_1 - \diff t_3 \\ - &\quad= - 2^d a_r^d a_{r'}^d \lambda^d - \prod_{j=1}^d - \int_{0}^{\infty} - \int_{0}^{\infty} - ((t / a_r) \wedge (s / a_{r'})) - \Gamma(0, t) - \Gamma(0, s) - \diff t - \diff s, - \end{align*} - % - as $\int_0^\infty \frac{e^{-t}}{a + t} \diff t = e^a \Gamma(0, a)$ - with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$. 
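    For completeness, this identity follows from the substitution $s = a + t$:
    %
    \begin{align*}
        \int_0^\infty \frac{e^{-t}}{a + t} \diff t
        = \int_a^\infty \frac{e^{-(s - a)}}{s} \diff s
        = e^a \int_a^\infty \frac{e^{-s}}{s} \diff s
        = e^a \Gamma(0, a).
    \end{align*}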
Now - % - \begin{align*} - &2 - \int_{0}^{\infty} - \int_{0}^{\infty} - ((t / a_r) \wedge (s / a_{r'})) - \Gamma(0, t) - \Gamma(0, s) - \diff t - \diff s \\ - &\quad= - \int_0^\infty - \Gamma(0, t) - \left( - \frac{1}{a_{r'}} - \int_0^{a_{r'} t / a_r} - 2 s \Gamma(0, s) - \diff{s} - + - \frac{t}{a_r} - \int_{a_{r'} t / a_r}^\infty - 2 \Gamma(0, s) - \diff{s} - \right) - \diff{t} \\ - &\quad= - \int_0^\infty - \Gamma(0, t) - \left( - \frac{t}{a_r} - e^{- \frac{a_{r'}}{a_r}t} - - \frac{1}{a_{r'}} e^{- \frac{a_{r'}}{a_r}t} - + \frac{1}{a_{r'}} - - \frac{a_{r'}}{a_r^2} t^2 - \Gamma\left(0, \frac{a_{r'}}{a_r} t\right) - \right) - \diff{t} \\ - &\quad= - \frac{1}{a_r} - \int_0^\infty - t e^{- \frac{a_{r'}}{a_r} t} - \Gamma(0, t) - \diff{t} - - \frac{1}{a_{r'}} - \int_0^\infty - e^{- \frac{a_{r'}}{a_r} t} - \Gamma(0, t) - \diff{t} \\ - &\qquad+ - \frac{1}{a_{r'}} - \int_0^\infty - \Gamma(0, t) - \diff{t} - - - \frac{a_{r'}}{a_r^2} - \int_0^\infty - t^2 \Gamma\left(0, \frac{a_{r'}}{a_r} t\right) - \Gamma(0, t) - \diff{t}, - \end{align*} - % - since - $\int_0^a 2 t \Gamma(0, t) \diff t = a^2 \Gamma(0, a) - a e^{-a} -e^{-a} + 1$ - and - $\int_a^\infty \Gamma(0, t) \diff t = e^{-a} - a \Gamma(0, a)$. - Next, we use - % - $ \int_{0}^{\infty} \Gamma(0, t) \diff t = 1$, - $\int_{0}^{\infty} e^{-at} \Gamma(0, t) \diff t - = \frac{\log(1+a)}{a}$, - $\int_{0}^{\infty} t e^{-at} \Gamma(0, t) \diff t - = \frac{\log(1+a)}{a^2} - \frac{1}{a(a+1)}$, - and - $\int_{0}^{\infty} t^2 \Gamma(0, t) \Gamma(0, at) \diff t - = - \frac{2a^2 + a + 2}{3a^2 (a+1)} + \frac{2(a^3 + 1) \log(a+1)}{3a^3} - - \frac{2 \log a}{3}$ - to see - % - \begin{align*} - &2 - \int_{0}^{\infty} - \int_{0}^{\infty} - ((t / a_r) \wedge (s / a_{r'})) - \Gamma(0, t) - \Gamma(0, s) - \diff t - \diff s \\ - &\quad= - \frac{a_r \log(1+a_{r'} / a_r)}{a_{r'}^2} - - \frac{a_r / a_{r'}}{a_r + a_{r'}} - - \frac{a_r \log(1 + a_{r'} / a_r)}{a_{r'}^2} - + \frac{1}{a_{r'}} \\ - &\qquad+ - \frac{2 a_{r'}^2 + a_r a_{r'} + 2 a_r^2} - {3 a_r a_{r'} (a_r + a_{r'})} - - \frac{2(a_{r'}^3 + a_r^3) \log(a_{r'} / a_r+1)}{3 a_r^2 a_{r'}^2} - + \frac{2 a_{r'} \log (a_{r'} / a_r)}{3 a_r^2} \\ - &\quad= - \frac{2}{3 a_r} + \frac{2}{3 a_{r'}} - - \frac{2(a_r^3 + a_{r'}^3 ) \log(a_{r'} / a_{r}+1)} - {3 a_r^2 a_{r'}^2} - + \frac{2 a_{r'} \log (a_{r'} / a_{r})}{3 a_r^2} \\ - &\quad= - \frac{2}{3 a_r} - + \frac{2}{3 a_{r'}} - - \frac{2 a_{r'} \log(a_{r} / a_{r'} + 1)}{3 a_r^2} - - \frac{2 a_r \log(a_{r'} / a_{r} + 1)}{3 a_{r'}^2} \\ - &\quad= - \frac{2}{3 a_r} - \left( - 1 - \frac{a_{r'}}{a_r} - \log\left(\frac{a_{r}}{a_{r'}} + 1\right) - \right) - + \frac{2}{3 a_{r'}} - \left( - 1 - \frac{a_r }{a_{r'}} - \log\left(\frac{a_{r'}}{a_{r}} + 1\right) - \right). - \end{align*} - % - Finally, we conclude by giving the limiting variance. - % - \begin{align*} - &\sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \frac{n^2}{\lambda^d} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - &\quad= - \frac{\sigma^2(x)}{f(x)} - \sum_{r=0}^{J} - \sum_{r'=0}^{J} - \omega_r - \omega_{r'} - \left( - \frac{2 a_{r'}}{3} - \left( - 1 - \frac{a_{r'}}{a_r} - \log\left(\frac{a_r}{a_{r'}} + 1\right) - \right) - + \frac{2 a_r}{3} - \left( - 1 - \frac{a_r}{a_{r'}} - \log\left(\frac{a_{r'}}{a_r} + 1\right) - \right) - \right)^d \\ - &\qquad+ - O \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{\lambda^d \log n}{n} - \right). 
- \end{align*} - % - So the limit exists, and - with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} - \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, - the limiting variance is - % - \begin{align*} - \Sigma_\rd(x) - &= - \frac{\sigma^2(x)}{f(x)} - \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} - \left( \ell_{r r'} + \ell_{r' r} \right)^d. - \end{align*} - % -\end{proof} - -The new bias characterization with debiasing is an algebraic -consequence of the original bias characterization and the construction -of the debiased Mondrian random forest estimator. - -\begin{proof}[Theorem~\ref{thm:mondrian_bias_debiased}] - - By the definition of the debiased estimator and - Theorem~\ref{thm:mondrian_bias}, since $J$ and $a_r$ are fixed, - % - \begin{align*} - \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] - &= - \sum_{l=0}^J - \omega_l - \E \big[ - \hat \mu_l(x) - \Bigm| \bX, \bT - \big] \\ - &= - \sum_{l=0}^J - \omega_l - \left( - \mu(x) - + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} - \frac{B_r(x)}{a_l^{2r} \lambda^{2r}} - \right) - + O_\P \left( - \frac{1}{\lambda^\beta} - + \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} - \right). - \end{align*} - % - It remains to evaluate the first term. - Recalling that $A_{r s} = a_{r-1}^{2 - 2s}$ - and $A \omega = e_0$, we have - % - \begin{align*} - &\sum_{l=0}^J - \omega_l - \left( - \mu(x) - + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} - \frac{B_r(x)}{a_l^{2r} \lambda^{2r}} - \right) \\ - &\quad= - \mu(x) - \sum_{l=0}^J - \omega_l - + - \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} - \frac{B_r(x)}{\lambda^{2r}} - \sum_{l=0}^J - \frac{\omega_l}{a_l^{2r}} \\ - &\quad= - \mu(x) - (A \omega)_1 - + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor \wedge J} - \frac{B_r(x)}{\lambda^{2r}} - (A \omega)_{r+1} - + \sum_{r = (\lfloor \flbeta / 2 \rfloor \wedge J) + 1} - ^{\lfloor \flbeta / 2 \rfloor} - \frac{B_r(x)}{\lambda^{2r}} - \sum_{l=0}^J - \frac{\omega_l}{a_l^{2r}} \\ - &\quad= - \mu(x) - + \I\{\lfloor \flbeta / 2 \rfloor \geq J + 1\} - \frac{B_{J+1}(x)}{\lambda^{2J + 2}} - \sum_{l=0}^J - \frac{\omega_l}{a_l^{2J + 2}} - + O \left( \frac{1}{\lambda^{2J + 4}} \right) \\ - &\quad= - \mu(x) - + \I\{2J + 2 < \beta\} - \frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} - + O \left( \frac{1}{\lambda^{2J + 4}} \right). - \end{align*} - % -\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation_debiased}] - - \proofparagraph{consistency of $\hat\sigma^2(x)$} - - Recall that - % - \begin{align} - \label{eq:mondrian_app_sigma2_hat_proof} - \hat\sigma^2(x) - &= - \frac{1}{B} - \sum_{b=1}^{B} - \frac{\sum_{i=1}^n Y_i^2 \, \I\{X_i \in T_b(x)\}} - {\sum_{i=1}^n \I\{X_i \in T_b(x)\}} - - \hat \mu(x)^2. - \end{align} - % - The first term in \eqref{eq:mondrian_app_sigma2_hat_proof} - is simply a Mondrian forest estimator of - $\E[Y_i^2 \mid X_i = x] = \sigma^2(x) + \mu(x)^2$, - which is bounded and Lipschitz, - where $\E[Y_i^4 \mid X_i]$ is bounded almost surely. - So its conditional bias is controlled - by Theorem~\ref{thm:mondrian_bias} and is at most - $O_\P \left( \frac{1}{\lambda} + - \frac{\log n}{\lambda} \sqrt{\lambda^d / n} \right)$. - Its variance is - at most $\frac{\lambda^d}{n}$ by Theorem~\ref{thm:mondrian_clt_debiased}. - Consistency of the second term in \eqref{eq:mondrian_app_sigma2_hat_proof} - follows directly from Theorems~\ref{thm:mondrian_bias} and - \ref{thm:mondrian_clt_debiased} with the same bias and variance bounds. 
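    Putting these together, each of the two terms in
    \eqref{eq:mondrian_app_sigma2_hat_proof} differs from its target by at most
    %
    \begin{align*}
        O_\P \left(
        \frac{1}{\lambda}
        + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}}
        + \sqrt{\frac{\lambda^d}{n}}
        \right),
    \end{align*}
    %
    and since $\log n \lesssim \lambda$ in the polynomial regimes for
    $\lambda$ considered here, the middle term is dominated by the last one.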
- Therefore - % - \begin{align*} - \hat\sigma^2(x) - &= - \sigma^2(x) - + O_\P \left( - \frac{1}{\lambda} - + \sqrt{\frac{\lambda^d}{n}} - \right). - \end{align*} - - \proofparagraph{consistency of the sum} - % - Note that - % - \begin{align*} - &\frac{n}{\lambda^d} - \sum_{i=1}^n - \left( - \sum_{r=0}^J - \omega_r - \frac{1}{B} - \sum_{b=1}^B - \frac{\I\{X_i \in T_{r b}(x)\}} - {\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} - \right)^2 \\ - &\quad= - \frac{n}{\lambda^d} - \frac{1}{B^2} - \sum_{i=1}^n - \sum_{r=0}^J - \sum_{r'=0}^J - \omega_r - \omega_{r'} - \sum_{b=1}^B - \sum_{b'=1}^B - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)}. - \end{align*} - % - This is exactly the same as the quantity in - \eqref{eq:mondrian_app_clt_condition_sum}, if we were to take - $\varepsilon_i$ to be $\pm 1$ with equal probability. - Thus we immediately have convergence in probability - by the proof of Theorem~\ref{thm:mondrian_clt_debiased}: - % - \begin{align*} - \frac{n}{\lambda^d} - \sum_{i=1}^n - \left( - \sum_{r=0}^J - \omega_r - \frac{1}{B} - \sum_{b=1}^B - \frac{\I\{X_i \in T_{r b}(x)\}} - {\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} - \right)^2 - &= - \frac{n^2}{\lambda^d} - \sum_{r=0}^J - \sum_{r'=0}^J - \omega_r - \omega_{r'} - \E \left[ - \frac{\I_{i b r}(x) \I_{i b' r'}(x)} - {N_{b r}(x) N_{b' r'}(x)} - \right] \\ - &\quad+ - O_\P \left( - \frac{1}{\sqrt B} - + \sqrt{\frac{\lambda^d \log n}{n}} - \right). - \end{align*} - - \proofparagraph{conclusion} - - By the proof of Theorem~\ref{thm:mondrian_clt_debiased} - with $\varepsilon_i$ being $\pm 1$ with equal probability, - and by previous parts, - % - \begin{align*} - \hat\Sigma_\rd(x) - = \Sigma_\rd(x) - + O_\P \left( - \frac{(\log n)^{d+1}}{\lambda} - + \frac{1}{\sqrt B} - + \sqrt{\frac{\lambda^d \log n}{n}} - \right). - \end{align*} - -\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_confidence_debiased}] - % - By Theorem~\ref{thm:mondrian_bias_debiased} - and Theorem~\ref{thm:mondrian_variance_estimation_debiased}, - % - \begin{align*} - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu_\rd(x) - \mu(x)}{\hat \Sigma_\rd(x)^{1/2}} - &= - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} - {\hat \Sigma_\rd(x)^{1/2}} - + \sqrt{\frac{n}{\lambda^d}} - \frac{\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \mu(x)} - {\hat \Sigma_\rd(x)^{1/2}} \\ - &= - \sqrt{\frac{n}{\lambda^d}} - \frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} - {\hat \Sigma_\rd(x)^{1/2}} - + \sqrt{\frac{n}{\lambda^d}} \, - O_\P \left( - \frac{1}{\lambda^\beta} - + \frac{1}{\lambda \sqrt B} - + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} - \right). - \end{align*} - % - The first term converges weakly to $\cN(0,1)$ by - Slutsky's theorem and Theorems~\ref{thm:mondrian_clt_debiased} - and \ref{thm:mondrian_variance_estimation_debiased}, - while the second is $o_\P(1)$ by assumption. - Validity of the confidence interval follows. 
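    To spell out the final step, let $q_{1-\alpha}$ denote the quantile used
    in the interval, chosen so that
    $\P\big( |\cN(0,1)| \leq q_{1-\alpha} \big) = 1 - \alpha$.
    The weak convergence just established gives
    %
    \begin{align*}
        \P\left(
        \sqrt{\frac{n}{\lambda^d}} \,
        \frac{\big| \hat \mu_\rd(x) - \mu(x) \big|}{\hat \Sigma_\rd(x)^{1/2}}
        \leq q_{1-\alpha}
        \right)
        \to
        \P\big( |\cN(0,1)| \leq q_{1-\alpha} \big)
        = 1 - \alpha,
    \end{align*}
    %
    and the event on the left-hand side is precisely the coverage event for
    the interval in Theorem~\ref{thm:mondrian_confidence_debiased}.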
- % -\end{proof} - -\begin{proof}[Theorem~\ref{thm:mondrian_minimax}] - - Theorem~\ref{thm:mondrian_bias_debiased} - and the proof of Theorem~\ref{thm:mondrian_clt_debiased} - with $J = \lfloor \flbeta / 2 \rfloor$ gives - % - \begin{align*} - \E \left[ - \big( - \hat \mu_\rd(x) - - \mu(x) - \big)^2 - \right] - &= - \E \left[ - \big( - \hat \mu_\rd(x) - - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \big)^2 - \right] - + \E \left[ - \big( - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - - \mu(x) - \big)^2 - \right] \\ - &\lesssim - \frac{\lambda^d}{n} - + \frac{1}{\lambda^{2\beta}} - + \frac{1}{\lambda^2 B}. - \end{align*} - % - We use here an $L^2$ version of Theorem~\ref{thm:mondrian_bias_debiased} - which is immediate from the proof of Theorem~\ref{thm:mondrian_bias}, - since we leveraged Chebyshev's inequality. Now since - $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$ and - $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$, - % - \begin{align*} - \E \left[ - \big( - \hat \mu_\rd(x) - - \mu(x) - \big)^2 - \right] - &\lesssim - n^{-\frac{2\beta}{d + 2 \beta}}. - \end{align*} -\end{proof} - -\section{Further properties of the Mondrian process} - -In section, we state and prove a collection of lemmas concerning -various properties of the Mondrian process. While they are not used directly -in our analysis of Mondrian random forest estimators, we believe that -these results, along with the techniques displayed during their proofs, -may be of potential independent interest. - -Our analysis of Mondrian random forest estimators in the main text -is for the most part -conducted pointwise, in the sense that we first fix $x \in [0,1]^d$ -and then analyze $\hat\mu(x)$. This means that we interact with the Mondrian -process -only through $T(x)$; that is, the cell in $T$ which contains the point $x$. -As such, we rely only on local properties of $T$, and may consider just a -single Mondrian cell. The lemmas in this section take a more global approach -to analyzing the Mondrian process, and we make statements about the -entire process $T$, rather than individual cells $T(x)$. -Such results may be useful for a future investigation of the uniform -properties of Mondrian forest estimators, as well as -being interesting in their own right. - -We begin with a tail bound for the number of cells appearing -in a Mondrian tree, offering a multiplicative -exponential inequality which -complements the exact expectation result given in -\citet[Proposition~2]{mourtada2020minimax}. -The resulting bound in probability is the same up to -logarithmic terms, and the sharp tail decay is useful -in combination with union bounds in our upcoming results. - -\begin{lemma}[Tail bound for the number of cells in a Mondrian tree] - \label{lem:mondrian_app_cells_tail} - - Let $D \subseteq \R^d$ be a rectangle and - $T \sim \cM(D, \lambda)$. Writing - $\# T$ for the number of cells in $T$, - % - \begin{align*} - \P\left( - \# T > 3 (1 + \lambda |D|_1)^d - (t + 1 + d \log(1 + \lambda |D|_1)) - \right) - &\leq - e^{-t}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_tail}] - - We refer to this method as the ``subcell trick'' - and attribute it to \citet{mourtada2017universal}. - For $\varepsilon > 0$, partition $D$ into - at most $(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$ - with side lengths at most $(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$. - Denote the restriction of a tree $T$ to a subcell $D'$ by $T \cap D'$. 
- Since a split in $T$ induces a split in at least one $T \cap D'$, - by a union bound - % - \begin{align*} - \P\left(\# T > t \right) - &\leq - \P\left(\sum_{D' \in \cD_\varepsilon} - \# (T \cap D') > t \right) - \leq - \sum_{D' \in \cD_\varepsilon} - \P\left( - \# (T \cap D') > - \frac{t}{\# \cD_\varepsilon} - \right). - \end{align*} - % - Now $\# (T \cap D')$ is dominated by a Yule process - with parameter $|D'|_1$ stopped at time $\lambda$ - \citep[proof of Lemma~2]{mourtada2017universal}, - so using that fact that if - $X \sim \Yule(a)$ - then $\P(X_t > n) \leq (1-e^{-at})^{n-1}$, - % - \begin{align*} - \P\left(\# T > t \right) - &\leq - \# \cD_\varepsilon \, - (1 - e^{-\lambda |D|_1 \varepsilon})^{t / \# \cD_\varepsilon - 1} - \leq - (1 + 1/\varepsilon)^d - (1 - e^{-\lambda |D|_1 \varepsilon})^{t (1 + 1/\varepsilon)^{-d} - 1}. - \end{align*} - % - Set $\varepsilon = \frac{1}{\lambda |D|_1}$, - note $1-1/e \leq e^{-1/3}$ - and replace $t$ by - $3 (1 + \lambda |D|_1)^d - (t + 1 + d \log(1 + \lambda |D|_1))$: - % - \begin{align*} - &\P\left(\# T > t \right) - \leq - (1 + \lambda |D|_1)^d - (1 - 1/e)^{t (1 + \lambda |D|_1)^{-d} - 1} - \leq - 2 (1 + \lambda |D|_1)^d - e^{-t (1 + \lambda |D|_1)^{-d} / 3}, \\ - &\P\left(\# T > - 3 - (1 + \lambda |D|_1)^d - (t + 1 + d \log(1 + \lambda |D|_1)) - \right) - \leq - e^{-t}. - \end{align*} - % -\end{proof} - -Next we provide a rigorous justification to the observation that the cells -in a Mondrian process should have the same shape distribution, though -of course they are not independent. To state and prove this result, -we need a way to identify a particular cell by endowing the -cells in a Mondrian tree with a natural order. - -\begin{definition}[Canonical order of cells in a Mondrian tree] - Let $T \sim \cM(D, \lambda)$. - Each cell in a fixed realization of $T$ can be described by - a word from the alphabet $\{l, r\}$, - where $l$ indicates the cell to the left of a split - and $r$ indicates the cell to the right. - For example, if there are no splits we have one cell - described by the empty word. - After one split there are two cells, denoted - $l$ and $r$. - Now suppose that the cell $r$ splits again, giving two splits and three cells, - denoted $l$, $r l$, and $r r$. - Define the canonical ordering of the cells of $T$ by applying - the lexicographic order to their words, with $l < r$. - Note that it does not matter which coordinate each split occurs in: - in two dimensions, $l$ might refer to the ``left'' or ``bottom'' - and $r$ to the ``right'' or ``top'' cell. -\end{definition} - -\begin{lemma}[Cells in a Mondrian tree have identically distributed shapes] - \label{lem:mondrian_app_cells_identically_distributed} - - Let $T \sim \cM(D, \lambda)$ - with ordered cells $D'_1, \ldots, D'_{\# T}$. - For $\varepsilon_1, \ldots, \varepsilon_d \geq 0$ - and $1 \leq i \leq k$, - % - \begin{align*} - \P\left( - |D'_{i1}| \leq \varepsilon_1, - \ldots, |D'_{id}| \leq \varepsilon_d, - \# T = k - \right) - &= - \P\left( - |D'_{11}| \leq \varepsilon_1, - \ldots, |D'_{1d}| \leq \varepsilon_d, - \# T = k - \right). - \end{align*} - % - Marginalizing over $\# T$ - with $E_j$ i.i.d.\ $\Exp(1)$, - \citet[Proposition~1]{mourtada2020minimax} gives - % - \begin{align*} - \P\left( - |D'_{i1}| > \varepsilon_1, - \ldots, |D'_{id}| > \varepsilon_d - \right) - &= - \prod_{j=1}^d - \P\left( - \frac{E_j}{\lambda} \wedge |D_j| - > \varepsilon_j - \right) - = \prod_{j=1}^d - \I\{|D_j| > \varepsilon_j\} - e^{-\lambda \varepsilon_j}. 
- \end{align*} - -\end{lemma} - -We observe a version of the famous Poisson process inspection or waiting time -paradox in the sizes of Mondrian cells. The above -Lemma~\ref{lem:mondrian_app_cells_identically_distributed} shows that for a -large enough -lifetime $\lambda$, the volume of any cell $D$ has the same distribution as the -volume of a corner cell, and is asymptotically -$\E[|D|] \asymp \E \left[ \prod_{j=1}^{d} (E_j / \lambda) \right] -= 1/\lambda^d$. -This is consistent with \citet[Proposition~2]{mourtada2020minimax} who give -$\E[\# T] \asymp \lambda^d$. -However, if instead of selecting a cell directly, -we instead select a fixed interior point $x$ -and query the cell $T(x)$ which contains it, we find that -$\E[|T(x)|] \asymp \E \left[ -\prod_{j=1}^{d} ((E_{1j} + E_{2j}) / \lambda) \right] -= 2^d/\lambda^d$, where $E_{1j}, E_{2j}$ are i.i.d.\ $\Exp(1)$, -by \citet[Proposition~1]{mourtada2020minimax}. -Since $T(x)$ contains $x$ by construction, a size-biasing phenomenon occurs -and we see that $T(x)$ is on average larger than a typical Mondrian cell. - -\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_identically_distributed}] - - Let $w$ be the word associated with the cell $D_i \in T$. - Note that $i=1$ if and only if $r \notin w$, as then $D_i$ is the left child - of every split. - So suppose $r \in w$. - Let $\tilde w$ be the word obtained by replacing all occurrences - of $r$ in $w$ with an $l$. - Each such replacement corresponds to a split in $T$. - Let $\tilde T$ be the same process as $T$ but with the following - modification: for each split where a replacement was made, - change the uniform random variable $S$ - (from the definition of $T$, see Section~\ref{sec:mondrian_process}) to $1-S$. - Since $S$ is independent of everything else in the construction of $T$, - we observe that $\tilde T \sim \cM(D, \lambda)$ also. - Further, there is almost surely exactly one cell in $\tilde T$ - which has the same shape as $D$, as the uniform distribution has no atoms. - Denote this cell by $\tilde D$ and note that - the replacements imply that its word in $\tilde T$ - is $\tilde w$. - Thus $\tilde D = \tilde D_1$ in $\tilde T$ and so - $(|D_{i1}|, \ldots, |D_{i d}|, \# T) - = (|\tilde D_{11}|, \ldots, |\tilde D_{1d}|, \# \tilde T)$. - Equality of the distributions follows. -\end{proof} - -As our next result we provide a tail bound for the size of the largest -Mondrian cell. The cells within a Mondrian tree are of course not independent, -and in fact there should intuitively be some negative correlation between their -sizes, due to the fact that they must all fit within the original cell $D$. - -\begin{lemma}[Tail bound on largest Mondrian cell] - \label{lem:mondrian_app_largest_cell_tail} - - Let $T \sim \cM(D, \lambda)$. - For any $\varepsilon > 0$, - % - \begin{align*} - \P\left( - \max_{D' \in T} - \max_{1 \leq j \leq d} - |D'_j| > \varepsilon - \right) - &\leq - 5d (1 + \lambda |D|_1)^{d+1} - e^{-\lambda \varepsilon}. - \end{align*} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell_tail}] - - Let $D_i$ be the ordered cells of $T$ and take $k \geq 1$. 
- By union bounds and - Lemma~\ref{lem:mondrian_app_cells_identically_distributed}, - % - \begin{align*} - \P\left( - \max_{D' \in T} - \max_{1 \leq j \leq d} - |D'_j| > \varepsilon - \right) - &\leq - \sum_{l=1}^k - \P\left( - \max_{1 \leq i \leq l} - \max_{1 \leq j \leq d} - |D_{i j}| > \varepsilon, - \# T = l - \right) - + \P\left( \# T > k \right) \\ - &\leq - \sum_{l=1}^k - \sum_{i=1}^l - \sum_{j=1}^d - \P\big( - |D_{i j}| > \varepsilon, - \# T = l - \big) - + \P\left( \# T > k \right) \\ - &\leq - \sum_{l=1}^k - l d \, - \P\big( - |D_{1j}| > \varepsilon, - \# T = l - \big) - + \P\left( \# T > k \right) \\ - &\leq - k d \, - \P\big(|D_{1 j}| > \varepsilon \big) - + \P\left( \# T > k \right). - \end{align*} - % - For the first term we use the exact distribution of - $D_1$ from Lemma~\ref{lem:mondrian_app_cells_identically_distributed} - and for the second term we apply Lemma~\ref{lem:mondrian_app_cells_tail}. - % - \begin{align*} - \P\left( - \max_{D' \in T} - \max_{1 \leq j \leq d} - |D'_j| > \varepsilon - \right) - &\leq - k d \, \P\big(|D_{1 j}| > \varepsilon \big) - + \P\left( \# T > k \right) \\ - &\leq - k d \, e^{-\lambda \varepsilon} - + 2 (1 + \lambda |D|_1)^d - e^{-k (1 + \lambda |D|_1)^{-d} / 3}. - \end{align*} - % - Finally, set - $k = \big\lceil 3 \lambda \varepsilon (1 + \lambda |D|_1)^d \big\rceil$ - and note the bound is trivial unless $\varepsilon \leq |D|_1$. - % - \begin{align*} - \P\left( - \max_{D' \in T} - \max_{1 \leq j \leq d} - |D'_j| > \varepsilon - \right) - &\leq - \big( 3 \lambda \varepsilon (1 + \lambda |D|_1)^d + 1 \big) - d \, e^{-\lambda \varepsilon} - + 2 (1 + \lambda |D|_1)^d - e^{-\lambda \varepsilon} \\ - &\leq - 3d (1 + \lambda |D|_1)^{d+1} - e^{-\lambda \varepsilon} - + 2 (1 + \lambda |D|_1)^d - e^{-\lambda \varepsilon} \\ - &\leq - 5d (1 + \lambda |D|_1)^{d+1} - e^{-\lambda \varepsilon}. - \end{align*} - % -\end{proof} - -For the remainder of this section, we turn our attention to the partitions -generated by Mondrian random forests. In particular, we study the refinement -generated by overlaying $B$ independent Mondrian processes with possibly -different lifetime parameters, and intersecting their resulting individual -partitions. - -\begin{definition}[Partition refinement]% - % - Let $T_1, \ldots, T_B$ be partitions of a set. - Their common refinement is - % - \begin{align*} - \bigwedge_{b=1}^B T_b - = \left\{ - \bigcap_{b=1}^B D_b: - D_b \in T_b - \right\} - \bigsetminus - \left\{ \emptyset \right\}. - \end{align*} - % -\end{definition} - -We begin our analysis of Mondrian forest refinements with a pair of simple -inequalities for bounding the total number of refined cells -in Lemma~\ref{lem:mondrian_app_refinement_inequalities}. This result does not -depend -on the probabilistic structure of the Mondrian process, and holds for any -rectangular partitions. - -\begin{lemma}[Inequalities for refinements of rectangular partitions] - \label{lem:mondrian_app_refinement_inequalities} - - Let $T_1, \ldots, T_B$ be rectangular partitions of a $d$-dimensional - rectangle $D$. Then - % - \begin{align} - \label{eq:mondrian_app_refinement_1} - \# \bigwedge_{b=1}^B T_b - &\leq \prod_{b=1}^B \# T_b, - \end{align} - % - and for all $B \leq d$ there exist $T_b$ such that - \eqref{eq:mondrian_app_refinement_1} holds with equality. 
    If $\# T_{b j}$ denotes the number of splits
    made by $T_b$ in dimension $j$, then
    %
    \begin{align}
        \label{eq:mondrian_app_refinement_2}
        \# \bigwedge_{b=1}^B T_b
        &\leq \prod_{j=1}^d
        \left( 1 + \sum_{b=1}^B \# T_{b j} \right),
    \end{align}
    %
    and for all $B \geq d$ there exist $T_b$ such that
    \eqref{eq:mondrian_app_refinement_2} holds with equality.

\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_inequalities}]

    The first inequality \eqref{eq:mondrian_app_refinement_1}
    follows because every cell in
    $\bigwedge_b T_b$ is the intersection of cells
    $D_b \in T_b$ for $1 \leq b \leq B$, and there are at most
    $\prod_{b=1}^{B} \# T_b$ ways to choose these.
    This bound is achievable when $B \leq d$ by setting
    $T_b$ to be a tree with splits only in dimension $b$,
    so that every such intersection of cells
    gives a cell in the refinement.

    For the second inequality \eqref{eq:mondrian_app_refinement_2},
    we construct a new forest of trees.
    In particular, for each $1 \leq j \leq d$ define
    $A_j$ to be the set of locations in $D_j$ where a tree $T_b$
    makes a split in dimension $j$ for some $b$.
    Define $T'_j$ to be a tree which has splits
    only in dimension $j$ and at the locations prescribed by $A_j$.
    Clearly, since every split in $T'_j$
    comes from a split in some $T_b$ in dimension $j$,
    we have $\# T'_j \leq 1 + \sum_b \# T_{b j}$.
    Applying the first inequality to this new forest yields
    $\# \bigwedge_j T'_j \leq \prod_j \# T'_j
    \leq \prod_j \big( 1 + \sum_b \# T_{b j} \big)$.
    Finally, note that $\bigwedge_j T'_j$
    is a refinement of $\bigwedge_b T_b$ and the result follows.
    This bound is achievable when $B \geq d$ by letting
    $T_b$ have splits only in dimension $b$ when $b \leq d$
    and be the trivial partition otherwise.
    %
\end{proof}

The inequalities in Lemma~\ref{lem:mondrian_app_refinement_inequalities}
provide rather crude bounds for the number of cells in a Mondrian forest
refinement, as they do not take into account the random structure.
Indeed, it should be clear that the ``worst case'' scenarios, involving
trees which contain splits only in a single direction, should be extremely
unlikely under the Mondrian law.
In Lemma~\ref{lem:mondrian_app_refinement} we confirm this intuition and
provide an exact value for the expected number of cells in a Mondrian
refinement by direct calculation. This result strictly generalizes the
single-tree version provided as \citet[Proposition~2]{mourtada2020minimax}.

\begin{lemma}[Expected number of cells in a Mondrian forest refinement]
    \label{lem:mondrian_app_refinement}

    Let $D$ be a $d$-dimensional rectangle
    and take $\lambda_b > 0$ for $1 \leq b \leq B$.
    Let $T_b \sim \cM(D, \lambda_b)$ be independent.
    Then the expected number of cells in their refinement is exactly
    %
    \begin{align*}
        \E\left[\# \bigwedge_{b=1}^B T_b \right]
        &= \prod_{j=1}^d \left(
        1 + |D_j| \sum_{b=1}^B \lambda_b
        \right).
    \end{align*}
    %
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement}]

    By \citet[Proposition~2]{mourtada2020minimax}
    we have the result for a single tree:
    %
    \begin{align}
        \label{eq:mondrian_app_single_tree}
        \E\left[\# T_b \right]
        &= \prod_{j=1}^d \left(
        1 + |D_j| \lambda_b
        \right).
    \end{align}
    %
    We proceed by induction on $B$.
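    Before giving the induction step, it is instructive to verify the
    one-dimensional case directly, as it shows where the additivity of the
    lifetimes comes from. In $d = 1$ the split locations of the independent
    processes $T_1, \ldots, T_B$ are almost surely distinct, so the
    refinement simply pools all of their splits, and by
    \eqref{eq:mondrian_app_single_tree},
    %
    \begin{align*}
        \E\left[\# \bigwedge_{b=1}^B T_b \right]
        = 1 + \sum_{b=1}^B \E\big[ \# T_b - 1 \big]
        = 1 + |D_1| \sum_{b=1}^B \lambda_b.
    \end{align*}
    %
    The general case replaces this pooling argument with the restriction
    property.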
- By the tower law, - % - \begin{align*} - \E\left[\# \bigwedge_{b=1}^B T_b \right] - &= - \E\left[ - \sum_{D' \in T_B} - \# - \bigwedge_{b=1}^{B-1} (T_b \cap D') - \right] - = \E\left[ - \sum_{D' \in T_B} - \E\left[ - \# - \bigwedge_{b=1}^{B-1} (T_b \cap D') - \biggm| T_B - \right] - \right]. - \end{align*} - % - Now by the restriction property of Mondrian processes - \citep[Fact~2]{mourtada2020minimax}, - observe that $T_b \cap D' \sim \cM(D', \lambda_b)$ - conditional on $T_B$. - Then by the induction hypothesis, - % - \begin{align*} - \E\left[ - \# - \bigwedge_{b=1}^{B-1} (T_b \cap D') - \biggm| T_B - \right] - &= - \prod_{j=1}^d \left( - 1 + |D'_j| \sum_{b=1}^{B-1} \lambda_b - \right) - = \E\big[ - \# T_{D'} \mid T_B - \big] - \end{align*} - % - where $T_{D'} \sim \cM\big(D', \sum_{b=1}^{B-1} \lambda_B\big)$ - conditional on $T_B$, - by the result for a single tree \eqref{eq:mondrian_app_single_tree}. - The restriction property finally shows that there exist realizations - of $T_{D'}$ which ensure that - $\sum_{D' \in T_B} \# T_{D'}$ is equal in distribution - to $\# T$, where $T \sim \cM(D, \sum_{b=1}^B \lambda_b)$, - so by \eqref{eq:mondrian_app_single_tree}, - % - \begin{align*} - \E\left[\# \bigwedge_{b=1}^B T_b \right] - &= - \E\left[ - \sum_{D' \in T_B} - \E\big[ - \# T_{D'} \mid T_B - \big] - \right] - = - \E\big[\# T \big] - = \prod_{j=1}^d \left( - 1 + |D_j| \sum_{b=1}^B \lambda_b - \right). - \end{align*} - % -\end{proof} - -While the exact expectation calculation in -Lemma~\ref{lem:mondrian_app_refinement} is neat, -sharper control on the tail -behavior of the number of cells in a Mondrian refinement is desired. -Lemma~\ref{lem:mondrian_app_refinement_tail} provides this, again -making use of the subcell trick to convert a crude bound based on -Lemma~\ref{lem:mondrian_app_refinement_inequalities} into a useful tail -inequality. -We assume for simplicity that all of the lifetimes are identical. - -\begin{lemma}[Tail bound on the number of cells in a Mondrian forest refinement] - \label{lem:mondrian_app_refinement_tail} - - Let $T_b \sim \cM(D, \lambda)$ be i.i.d.\ for $1 \leq b \leq B$. Then - % - \begin{align*} - \P\left( - \# \bigwedge_{b=1}^B T_b - > 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d - \right) - &\leq - 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_tail}] - - We begin with a coarse estimate and refine it with the subcell trick. - By Lemma~\ref{lem:mondrian_app_refinement_inequalities} - \eqref{eq:mondrian_app_refinement_2}, - for any $t > 0$, recalling that $\# T_{b j}$ is the number - of splits made by $T_b$ in dimension $j$, - % - \begin{align} - \nonumber - \P\left( - \# \bigwedge_{b=1}^B T_b - > t - \right) - &\leq - \P\left( - \prod_{j=1}^d - \left( - 1 + \sum_{b=1}^B \# T_{b j} - \right) - > t - \right) - \leq - \sum_{j=1}^d - \P\left( - 1 + \sum_{b=1}^B \# T_{b j} - > t^{1/d} - \right) \\ - \label{eq:mondrian_app_refinement_tail_coarse} - &\leq - d\, \P\left( - \sum_{b=1}^B \# T_b - > t^{1/d} - \right) - \leq - d B\, - \P\left( - \# T_b > \frac{t^{1/d}}{B} - \right). - \end{align} - % - By the subcell trick, partition $D$ into - at most $(1 + 1/\varepsilon)^d$ cells - $D' \in \cD_\varepsilon$ with side lengths at most - $(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$. 
- As every cell in $\bigwedge_b T_b$ corresponds to - at least one cell in $\bigwedge_b (T_b \cap D')$, - % - \begin{align*} - \P\left( - \# \bigwedge_{b=1}^B T_b - > t - \right) - &\leq - \P\left( - \sum_{D' \in \cD_\varepsilon} - \# \bigwedge_{b=1}^B (T_b \cap D') - > t - \right) - \leq - \sum_{D' \in \cD_\varepsilon} - \P\left( - \# \bigwedge_{b=1}^B (T_b \cap D') - > \frac{t}{\# \cD_\varepsilon} - \right). - \end{align*} - % - Applying the coarse estimate \eqref{eq:mondrian_app_refinement_tail_coarse} - to $\# \bigwedge_b (T_b \cap D')$ gives - % - \begin{align*} - \P\left( - \# \bigwedge_{b=1}^B T_b - > t - \right) - &\leq - d B \sum_{D' \in \cD_\varepsilon} - \P\left( - \# (T_b \cap D') - > \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} - \right). - \end{align*} - % - Now apply Lemma~\ref{lem:mondrian_app_cells_tail} - and set $\varepsilon = \frac{1}{\lambda |D|_1}$ to obtain - % - \begin{align*} - \P\left( - \# \bigwedge_{b=1}^B T_b - > t - \right) - &\leq - d B \sum_{D' \in \cD_\varepsilon} - \P\left( - \# (T_b \cap D') - > \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} - \right) \\ - &\leq - d B \sum_{D' \in \cD_\varepsilon} - 2 (1 + \lambda |D'|_1)^d - e^{- t^{1/d} \# \cD_\varepsilon^{-1/d} B^{-1} - (1 + \lambda |D'|_1)^{-d} / 3} \\ - &\leq - 2 d B (1 + 1 / \varepsilon)^d - (1 + \lambda \varepsilon |D|_1)^d - e^{- t^{1/d} (1 + 1/\varepsilon)^{-1} B^{-1} - (1 + \lambda \varepsilon |D|_1)^{-d} / 3} \\ - &\leq - 2^{d+1} d B (1 + \lambda |D|_1)^d - e^{- t^{1/d} (1 + \lambda |D|_1)^{-1} B^{-1} 2^{-d} / 3}. - \end{align*} - % - Finally, replacing $t$ by $3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d$ we have - % - \begin{align*} - \P\left( - \# \bigwedge_{b=1}^B T_b - > 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d - \right) - &\leq - 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. - \end{align*} - % -\end{proof} - -\chapter{Supplement to Dyadic Kernel Density Estimators} -\label{app:kernel} - -This section contains complementary detailed expositions of some -of our main results, along with additional technical lemmas -which may be of independent interest. We also provide full proofs -for all of our theoretical contributions. - -\section{Supplementary main results} - -In this first section we provide more detailed versions of some of the -results presented in the main text, alongside some intermediate -lemmas which were skipped for conciseness. -We begin with some extra notation used throughout this appendix. - -For real vectors, -$\|\cdot\|_p$ is the standard $\ell^p$-norm defined for $p \in [1, \infty]$. -For real square matrices, -$\|\cdot\|_p$ is the operator -norm induced by the corresponding vector norm. -In particular, -$\|\cdot\|_1$ -is the maximum absolute column sum, -$\|\cdot\|_\infty$ -is the maximum absolute row sum, -and $\|\cdot\|_2$ -is the maximum singular value. -For real symmetric matrices, -$\|\cdot\|_2$ -coincides with the maximum absolute eigenvalue. -We use $\|\cdot\|_{\max}$ -to denote the largest absolute entry of a real matrix. -For real-valued functions, -$\|\cdot\|_\infty$ -denotes the (essential) supremum norm. -For a bounded set $\cX \subseteq \R$ and $a \geq 0$ -we use $[\cX \pm a]$ to denote the compact interval -$[\inf \cX - a, \ \sup \cX + a]$. -For measurable subsets of $\R^d$ -we use $\Leb$ to denote the Lebesgue measure, -and for finite sets we use $|\cdot|$ -for the cardinality. -Write $\sum_i$ -for $\sum_{i=1}^n$ -when clear from context. 
-Similarly, use $\sum_{i \Du - \frac{t + C_1 \log n}{\sqrt{n}} - \right) - &\leq C_2 e^{-C_3 t}, - \end{align*} - % - for some positive constants - $C_1$, $C_2$, $C_3$, - and for all $t > 0$. - By integration of tail probabilities, - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \big| \sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\big| - \right] - &\lesssim - \frac{\Du \log n}{\sqrt{n}}. - \end{align*} - % - Further, - $Z_n^{L\prime}$ has the same covariance structure as - $\sqrt{n} L_n'$ in the sense that for all $w, w' \in \cW$, - % - \begin{align*} - \E\left[ - Z_n^{L\prime}(w) - Z_n^{L\prime}(w') - \right] - &= - n - \E\left[ - L_n'(w) - L_n'(w') - \right]. - \end{align*} - % - It also satisfies the following - trajectory regularity property - for any $\delta_n \in (0, 1/2]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| - Z_n^{L\prime}(w) - - Z_n^{L\prime}(w') - \big| - \right] - &\lesssim - \Du - \delta_n \sqrt{\log 1/\delta_n}, - \end{align*} - % - and has continuous trajectories. - The process $Z_n^{L\prime}$ - is a function only of $\bA_n'$ - and some random noise - which is independent of $(\bA_n', \bV_n')$. - -\end{lemma} - -\begin{lemma}[Conditional strong approximation of $E_n$] - \label{lem:kernel_app_conditional_strong_approx_En} - - Suppose Assumptions - \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. - For $n \geq 2$ - and $t_n > 0$ with $\left|\log t_n\right| \lesssim \log n$, - there exists on some probability space - a copy of - $\big(\bA_n, \bV_n, E_n\big)$, - denoted - $\big(\bA_n', \bV_n', E_n'\big)$, - and a process - $\tilde Z^{E\prime}_n$ - which is Gaussian conditional on $\bA_n'$ - and mean-zero conditional on $\bA_n'$, - satisfying - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \big| - \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) - \big| - > t_n - \Bigm\vert \bA_n' - \right) - &\leq - C_1 - t_n^{-2} - n^{-1/2} - h^{-3/4} - (\log n)^{3/4}, - \end{align*} - $\bA_n'$-almost surely - for some constant $C_1 > 0$. - Setting $t_n = n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ - for any sequence $R_n \to \infty$ - and taking an expectation gives - % - \begin{align*} - \sup_{w \in \cW} - \big| - \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) - \big| - &\lesssim_\P - n^{-1/4} - h^{-3/8} (\log n)^{3/8} R_n. - \end{align*} - % - Further, - $\tilde Z_n^{E\prime}$ has the same conditional covariance as - $\sqrt{n^2h} E_n'$ in that for all $w, w' \in \cW$, - % - \begin{align*} - \E\left[ - \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') - \bigm\vert \bA_n' - \right] - &= - n^2h - \E\left[ - E_n'(w) - E_n'(w') - \bigm\vert \bA_n' - \right]. - \end{align*} - % - It also satisfies the following - trajectory regularity property - for any $\delta_n \in (0, 1/(2h)]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| - \tilde Z_n^{E\prime}(w) - - \tilde Z_n^{E\prime}(w') - \big| - \right] - &\lesssim - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h\delta_n}}, - \end{align*} - % - and has continuous trajectories. - -\end{lemma} - -\begin{lemma}[Unconditional strong approximation of $E_n$] - \label{lem:kernel_app_unconditional_strong_approx_En} - - Suppose Assumptions - \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. - Let $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$ - be defined as in - Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. 
- For each $n \geq 2$ - there exists - (on some probability space) - a copy of - $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$, - denoted - $\big(\bA_n'', \bV_n'', \tilde Z_n^{E\dprime}\big)$, - and a centered - Gaussian process - $Z^{E\dprime}_n$ - satisfying - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \right] - &\lesssim - n^{-1/6} (\log n)^{2/3}. - \end{align*} - % - Further, - $Z_n^{E\dprime}$ has the same - (unconditional) covariance structure as - $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ - in the sense that for all $w, w' \in \cW$, - % - \begin{align*} - \E\left[ - Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w') - \right] - &= - \E\left[ - \tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w') - \right] - = - n^2h \, - \E\left[ - E_n(w) - E_n(w') - \right]. - \end{align*} - % - It also satisfies the following - trajectory regularity property - for any $\delta_n \in (0, 1/(2h)]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| - Z_n^{E\dprime}(w) - - Z_n^{E\dprime}(w') - \big| - \right] - &\lesssim - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h\delta_n}}. - \end{align*} - % - Finally, $Z_n^{E\dprime}$ is independent of $\bA_n''$ - and has continuous trajectories. - -\end{lemma} - -We combine these strong approximations to deduce a coupling for $\hat f_W$ in -Theorem~\ref{thm:kernel_app_strong_approx_fW}, taking care with independence -to ensure the approximating processes are jointly Gaussian. - -\begin{theorem}[Strong approximation of $\hat f_W$] - \label{thm:kernel_app_strong_approx_fW} - - Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} - hold. For each $n \geq 2$ and any sequence $R_n \to \infty$ there exists on - some probability space a centered Gaussian process $Z_n^{f\prime}$ and a copy - of $\hat f_W$, denoted $\hat f_W'$, satisfying - % - \begin{align*} - &\sup_{w \in \cW} - \Big| - \hat f_W'(w) - \E[\hat f_W'(w)] - - Z_n^{f\prime}(w) - \Big| \\ - &\quad\lesssim_\P - n^{-1} \log n - + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-7/6} h^{-1/2} (\log n)^{2/3}. - \end{align*} - % - Further, $Z_n^{f\prime}$ has the same covariance - structure as - $\hat f_W'(w)$ - in the sense that for all - $w, w' \in \cW$, - % - \begin{align*} - \E\big[Z_n^{f\prime}(w) Z_n^{f\prime}(w')\big] - &= - \Cov\Big[ - \hat f_W'(w), - \hat f_W'(w') - \Big] - = \Sigma_n(w,w'). - \end{align*} - % - It has continuous trajectories satisfying the following regularity property - for any $\delta_n \in (0, 1/2]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \Big| - Z_n^{f\prime}(w) - - Z_n^{f\prime}(w') - \Big| - \right] - &\lesssim - \frac{\Du}{\sqrt n} \delta_n - \sqrt{\log \frac{1}{\delta_n}} - + \frac{1}{\sqrt{n^2h}} - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h\delta_n}}. - \end{align*} - % -\end{theorem} - -The main result Theorem~\ref{thm:kernel_strong_approx_Tn} -now follows easily using Theorem~\ref{thm:kernel_app_strong_approx_fW}, -the bias bound from Theorem~\ref{thm:kernel_bias}, -and properties of $\Sigma_n$ established in -Lemma~\ref{lem:kernel_variance_bounds}. - -\subsection{Covariance estimation} -\label{sec:kernel_app_covariance_estimation} - -In this section we carefully construct a consistent estimator for the -covariance function $\Sigma_n$. Firstly, we characterize $\Sigma_n$ in -Lemma~\ref{lem:kernel_app_covariance_structure}. 
In -Lemma~\ref{lem:kernel_app_covariance_estimation} -we define the estimator and demonstrate that it converges in probability in a -suitable sense. In Lemma~\ref{lem:kernel_app_alternative_covariance_estimator} -we give an -alternative representation which is more amenable to computation. - -\begin{lemma}[Covariance structure] - \label{lem:kernel_app_covariance_structure} - - Suppose Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} - hold. Then $\Sigma_n$, as defined in Section~\ref{sec:kernel_degeneracy}, - admits the following representations, - where $1 \leq i < j < r \leq n$. - % - \begin{align*} - \Sigma_n(w,w') - &= - \frac{2}{n(n-1)} - \,\Cov\!\big[ - k_h(W_{i j},w), - k_h(W_{i j},w') - \big] - + - \frac{4(n-2)}{n(n-1)} - \,\Cov\!\big[ - k_h(W_{i j},w), - k_h(W_{i r},w') - \big] \\ - &= - \frac{2}{n(n-1)} - \,\Cov\!\big[ - k_h(W_{i j},w), - k_h(W_{i j},w') - \big] \\ - &\quad+ - \frac{4(n-2)}{n(n-1)} - \,\Cov\!\big[ - \E[k_h(W_{i j},w) \mid A_i], - \E[k_h(W_{i j},w') \mid A_i] - \big], - \end{align*} - % -\end{lemma} - -\begin{lemma}[Covariance estimation] - \label{lem:kernel_app_covariance_estimation} - - Grant Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth}, - and suppose $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. Define - % - \begin{align*} - S_{i j r}(w,w') - &= - \frac{1}{6} - \Big( - k_h(W_{i j},w) - k_h(W_{i r},w') - + k_h(W_{i j},w) - k_h(W_{jr},w') - + k_h(W_{i r},w) - k_h(W_{i j},w') \\ - &\quad+ - k_h(W_{i r},w) - k_h(W_{jr},w') - + k_h(W_{jr},w) - k_h(W_{i j},w') - + k_h(W_{jr},w) - k_h(W_{i r},w') - \Big), \\ - \hat \Sigma_n(w,w') - &= - \frac{4}{n^2(n-1)^2} - \sum_{i 0$ on $\cW$. - Then the optimization problem \eqref{eq:kernel_app_sdp} - has an approximately optimal solution $\hat\Sigma_n^+$ - which is uniformly entrywise-consistent - for $\Sigma_n$ in the sense that - % - \begin{align*} - \sup_{w,w' \in \cW} - \left| - \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} - \right| - &\lesssim_\P - \frac{\sqrt{\log n}}{n}. - \end{align*} - -\end{lemma} - -The optimization problem \eqref{eq:kernel_app_sdp} is stated for functions -rather than -matrices so is infinite-dimensional. However, when restricting to finite-size -matrices, Lemma~\ref{lem:kernel_app_sdp} still holds and does not depend on the -size -of the matrices. Furthermore, the problem then becomes a semi-definite program -and so can be solved to arbitrary precision in polynomial time in the size of -the matrices \citep{laurent2005semidefinite}. - -The Lipschitz-type constraint in the optimization problem -\eqref{eq:kernel_app_sdp} -ensures that $\hat \Sigma_n^+$ is sufficiently smooth and is a technicality -required by some of the later proofs. In practice this constraint is readily -verified. - -\begin{lemma}[Positive semi-definite variance estimator bounds] - \label{lem:kernel_app_variance_estimator_bounds} - - Suppose that Assumptions~\ref{ass:kernel_data} - and~\ref{ass:kernel_bandwidth} hold, and that - $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. - Then $\hat \Sigma_n^+(w,w) \geq 0$ - almost surely for all $w \in \cW$ and - % - \begin{align*} - \frac{\Dl^2}{n} + \frac{1}{n^2h} - &\lesssim_\P - \inf_{w \in \cW} \hat \Sigma_n^+(w,w) - \leq - \sup_{w \in \cW} \hat \Sigma_n^+(w,w) - \lesssim_\P - \frac{\Du^2}{n} + \frac{1}{n^2h}. 
- \end{align*} - -\end{lemma} - -\subsection{Feasible uniform confidence bands} - -We use the strong approximation derived in -Section~\ref{sec:kernel_app_strong_approx} and the -positive semi-definite covariance estimator introduced in -Section~\ref{sec:kernel_app_covariance_estimation} to construct feasible -uniform -confidence bands. We drop the prime notation for copies of processes -in the interest of clarity. - -\begin{lemma}[Proximity of the standardized and studentized $t$-statistics] - \label{lem:kernel_app_studentized_t_statistic} - - Let Assumptions \ref{ass:kernel_data} and - \ref{ass:kernel_bandwidth} hold, and suppose that - $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. - Define for $w \in \cW$ - the Studentized $t$-statistic process - % - \begin{align*} - \hat T_n(w) = \frac{\hat f_W(w) - f_W(w)} - {\sqrt{\hat\Sigma_n^+(w,w)}}. - \end{align*} - % - Then - % - \begin{align*} - \sup_{w \in \cW} - \left| \hat T_n(w) - T_n(w) \right| - &\lesssim_\P - \sqrt{\frac{\log n}{n}} - \left( - \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} - {\Dl + 1/\sqrt{n h}} - \right) - \frac{1}{\Dl + 1/\sqrt{n h}}. - \end{align*} - -\end{lemma} - -\begin{lemma}[Feasible Gaussian approximation - of the infeasible Gaussian process] - \label{lem:kernel_app_distributional_approx_feasible_gaussian} - - Let Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} - hold, and suppose that - $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. - Define a process $\hat Z_n^T(w)$ which, - conditional on the data $\bW_n$, - is conditionally mean-zero and - conditionally Gaussian, and whose - conditional covariance structure is - % - \begin{align*} - \E\left[ - \hat Z_n^T(w) \hat Z_n^T(w') - \bigm| \bW_n \right] - &= - \frac{\hat \Sigma_n^+(w,w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \end{align*} - % - Then the following conditional - Kolmogorov--Smirnov result holds. - % - \begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| Z_n^T(w) \right| - \leq t - \right) - - \P\left( - \sup_{w \in \cW} - \left| \hat Z_n^T(w) \right| - \leq t - \biggm\vert \bW_n - \right) - \right| - &\lesssim_\P - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}}. - \end{align*} - -\end{lemma} - -\begin{lemma}[Feasible Gaussian approximation of the studentized $t$-statistic] - \label{lem:kernel_app_feasible_gaussian_approx} - - Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth} - and \ref{ass:kernel_rates} hold, and suppose that $f_W(w) > 0$ on $\cW$. - Then - % - \begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - - \P\left( - \sup_{w \in \cW} - \left| \hat Z_n^T(w) \right| - \leq t - \Bigm\vert \bW_n - \right) - \right| - &\ll_\P - 1. - \end{align*} - -\end{lemma} - -These intermediate lemmas can be used to establish the valid and feasible -uniform confidence bands presented in Theorem~\ref{thm:kernel_ucb} in the main -text. See Section~\ref{sec:kernel_app_proofs} for details. - -\subsection{Counterfactual dyadic density estimation} - -In this section we give a detailed analysis of the counterfactual -estimator of Section~\ref{sec:kernel_counterfactual}. -We begin with an assumption describing the counterfactual setup. - -\begin{assumption}[Counterfactual data generation] - \label{ass:kernel_app_counterfactual} - - For each $r \in \{0,1\}$, - let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be as in - Assumption~\ref{ass:kernel_data}. 
- Let $X_i^r$ be finitely-supported variables, - setting $\bX_n^r = (X_1^r, \ldots, X_n^r)$. - Suppose that $(A_i^r, X_i^r)$ are - independent over $1 \leq i \leq n$ - and that $\bX_n^r$ is independent of $\bV_n^r$. - Assume that $W_{i j}^r \mid X_i^r, X_j^r$ - has a Lebesgue density - $f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$ - and that $X_i^r$ has positive - probability mass function - $p_X^r(x)$ on a common support $\cX$. - Suppose that - $(\bA_n^0, \bV_n^0, \bX_n^0)$ - and $(\bA_n^1, \bV_n^1, \bX_n^1)$ - are independent. - -\end{assumption} - -The counterfactual density of $W_{i j}$ in population $1$ had $X_i, X_j$ -followed population $0$ is -% -\begin{align*} - f_W^{1 \triangleright 0}(w) - &= - \E\left[ - f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) - \right] - = \sum_{x_1 \in \cX} - \sum_{x_2 \in \cX} - f_{W \mid XX}^{1}(w \mid x_1, x_2) - \psi(x_1) - \psi(x_2) - p_X^{1}(x_1) - p_X^{1}(x_2), -\end{align*} -% -with $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$. -Define the counterfactual dyadic kernel density estimator -% -\begin{align*} - \hat f_W^{1 \triangleright 0}(w) - &= - \frac{2}{n(n-1)} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^n - \hat \psi(X_i^1) - \hat \psi(X_j^1) - k_h(W_{i j}^1, w), -\end{align*} -% -where -$\hat\psi(x) = \I\{\hat p_X^{1}(x) > 0\}\hat p_X^{0}(x) / \hat p_X^{1}(x)$ -and $\hat p_X^{r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$. -Since $p_X^r(x) > 0$, -% -\begin{align*} - \hat\psi(x) - \psi(x) - &= - \frac{\hat p_X^{0}(x) - p_X^0(x)}{p_X^1(x)} - - \frac{p_X^0(x)}{p_X^1(x)} - \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \\ - &\quad+ - \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} - \frac{\hat p_X^{1}(x) p_X^0(x) - \hat p_X^{0}(x)p_X^1(x)} - {\hat p_X^{1}(x) p_X^1(x)} \\ - &= - \frac{1}{n} - \sum_{r=1}^n \kappa(X_r^0, X_r^1, x) - + O_\P\left(\frac{1}{n}\right) -\end{align*} -% -is an asymptotic linear representation where -% -\begin{align*} - \kappa(X_i^0, X_i^1, x) - &= - \frac{\I\{X_i^0 = x\} - p_X^0(x)}{p_X^1(x)} - - \frac{p_X^0(x)}{p_X^1(x)} - \frac{\I\{X_i^1 = x\} - p_X^1(x)}{p_X^1(x)} -\end{align*} -% -satisfies -$\E[\kappa(X_i^0, X_i^1, x)] = 0$. -We now establish uniform consistency and feasible strong -approximation results for the counterfactual density estimator. - -\begin{lemma}[Bias of $\hat f_W^{1 \triangleright 0}$] - \label{lem:kernel_app_counterfactual_bias} - - Suppose that Assumptions~\ref{ass:kernel_data}, - \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. - Then - % - \begin{align*} - \sup_{w \in \cW} - \big| - \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - - f_W^{1 \triangleright 0}(w) - \big| - \lesssim - h^{p \wedge \beta} + \frac{1}{n}. - \end{align*} - -\end{lemma} - -\begin{lemma}[Hoeffding-type decomposition for - $\hat f_W^{1 \triangleright 0}$] - \label{lem:kernel_app_counterfactual_hoeffding} - - Suppose that Assumptions~\ref{ass:kernel_data}, - \ref{ass:kernel_bandwidth}, and - \ref{ass:kernel_app_counterfactual} hold. 
- With $k_{i j} = k_h(W_{i j}^1, w)$, - $\kappa_{r i} = \kappa(X_r^0, X_r^1, X_i^1)$ - and $\psi_i = \psi(X_i^1)$, define the projections - % - \begin{align*} - u - &= - \E\left[ - k_{i j} - \psi_i - \psi_j - \right], \\ - u_i - &= - \frac{2}{3} \psi_i - \E\left[ - k_{i j} - \psi_j - \mid A_i^1 \right] - + - \frac{2}{3} \E\left[ - k_{jr} - \psi_j \kappa_{i r} - \mid X_i^0, X_i^1 \right] - - \frac{2}{3} u, \\ - u_{i j} - &= - \frac{1}{3} - \psi_i - \psi_j - \E\left[ - k_{i j} - \mid A_i^1, A_j^1 \right] - + - \frac{1}{3} - \psi_i - \E\left[ - k_{i r} \psi_r - \mid A_i^1 \right] - + - \frac{1}{3} - \psi_i - \E\left[ - k_{i r} \kappa_{jr} - \mid A_i^1, X_j^0, X_j^1 \right] \\ - &\quad+ - \frac{1}{3} - \kappa_{j i} - \E\left[ - k_{i r} \psi_r - \mid A_i^1 \right] - + \frac{1}{3} - \psi_j - \E\left[ - k_{jr} \psi_r - \mid A_j^1 \right] - + - \frac{1}{3} - \psi_j - \E\left[ - k_{jr} \kappa_{i r} - \mid X_i^0, X_i^1, A_j^1 \right] \\ - &\quad+ - \frac{1}{3} - \kappa_{i j} - \E\left[ - k_{jr} \psi_r - \mid A_j^1 \right] - - u_i - u_j + u, \\ - u_{i j r} - &= - \frac{1}{3} - \psi_i \psi_j - \E\left[ - k_{i j} - \mid A_i^1, A_j^1 \right] - + - \frac{1}{3} - \psi_i \kappa_{r j} - \E\left[ - k_{i j} - \mid A_i^1, A_j^1 \right] - + - \frac{1}{3} - \psi_j \kappa_{r i} - \E\left[ - k_{i j} - \mid A_i^1, A_j^1 \right] \\ - &\quad+ - \frac{1}{3} - \psi_i \psi_r - \E\left[ - k_{i r} - \mid A_i^1, A_r^1 \right] - + \frac{1}{3} - \psi_i \kappa_{jr} - \E\left[ - k_{i r} - \mid A_i^1, A_r^1 \right] - + - \frac{1}{3} - \psi_r \kappa_{j i} - \E\left[ - k_{i r} - \mid A_i^1, A_r^1 \right] \\ - &\quad+ - \frac{1}{3} - \psi_j \psi_r - \E\left[ - k_{jr} - \mid A_j^1, A_r^1 \right] - + \frac{1}{3} - \psi_j \kappa_{i r} - \E\left[ - k_{jr} - \mid A_j^1, A_r^1 \right] - + - \frac{1}{3} - \psi_r \kappa_{i j} - \E\left[ - k_{jr} - \mid A_j^1, A_r^1 \right] \\ - &\quad- - u_{i j} - u_{i r} - u_{jr} - + u_i + u_j + u_r - - u, \\ - v_{i j r} - &= - \frac{1}{3} - k_{i j} \big(\psi_i \psi_j +\psi_i \kappa_{r j} +\psi_j \kappa_{r i} \big) - + \frac{1}{3} - k_{i r} \big(\psi_i \psi_r +\psi_i \kappa_{jr} +\psi_r \kappa_{j i} \big) \\ - &\quad+ - \frac{1}{3} - k_{jr} \big(\psi_j \psi_r +\psi_j \kappa_{i r} +\psi_r \kappa_{i j} \big). - \end{align*} - % - With $l_i^{1 \triangleright 0}(w) = u_i$ - and $e_{i j r}^{1 \triangleright 0}(w) = v_{i j r} - u_{i j r}$, - set - % - \begin{align*} - L_n^{1 \triangleright 0}(w) - &= - \frac{3}{n} \sum_{i=1}^n - l_i^{1 \triangleright 0}(w) - &\text{and} & - &E_n^{1 \triangleright 0}(w) - &= - \frac{6}{n(n-1)(n-2)} - \sum_{i=1}^{n-2} - \sum_{j=i+1}^{n-1} - \sum_{r=i+1}^n - e_{i j r}^{1 \triangleright 0}(w). - \end{align*} - % - Then the following Hoeffding-type decomposition holds, - where $O_\P(1/n)$ is uniform in $w \in \cW$. - % - \begin{align*} - \hat f_W^{1 \triangleright 0}(w) - = \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - + L_n^{1 \triangleright 0}(w) - + E_n^{1 \triangleright 0}(w) - + O_\P\left( \frac{1}{n} \right). - \end{align*} - % - Further, - the stochastic processes - $L_n^{1 \triangleright 0}$ - and $E_n^{1 \triangleright 0}$ - are mean-zero and orthogonal - in $L^2(\P)$. - Define the upper and lower degeneracy constants as - % - \begin{align*} - \Du^{1 \triangleright 0} - &= - \limsup_{n \to \infty} - \sup_{w \in \cW} - \Var\big[ - l_i^{1 \triangleright 0}(w) - \big]^{1/2} - &\text{and}& - & - \Dl^{1 \triangleright 0} - &= - \liminf_{n \to \infty} - \inf_{w \in \cW} - \Var\big[ - l_i^{1 \triangleright 0}(w) - \big]^{1/2}. 
  \end{align*}

\end{lemma}

\begin{lemma}[Uniform consistency of $\hat f_W^{1 \triangleright 0}$]
  \label{lem:kernel_app_counterfactual_uniform_consistency}

  Suppose that Assumptions~\ref{ass:kernel_data},
  \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold.
  Then
  %
  \begin{align*}
    \E\left[
    \sup_{w \in \cW}
    \big|\hat f_W^{1 \triangleright 0}(w)
    - f_W^{1 \triangleright 0}(w)
    \big|
    \right]
    &\lesssim
    h^{p \wedge \beta}
    + \frac{\Du^{1 \triangleright 0}}{\sqrt n}
    + \sqrt{\frac{\log n}{n^2h}}.
  \end{align*}

\end{lemma}

\begin{lemma}[Strong approximation of $\hat f_W^{1 \triangleright 0}$]
  \label{lem:kernel_app_counterfactual_sa}

  On an appropriately enlarged probability space
  and for any sequence $R_n \to \infty$,
  there exists a mean-zero Gaussian process
  $Z_n^{f, 1 \triangleright 0}$
  with the same covariance structure as
  $\hat f_W^{1 \triangleright 0}(w)$ satisfying
  %
  \begin{align*}
    &\sup_{w \in \cW}
    \left|
    \hat f_W^{1 \triangleright 0}(w)
    - \E\big[\hat f_W^{1 \triangleright 0}(w)\big]
    - Z_n^{f, 1 \triangleright 0}(w)
    \right| \\
    &\quad\lesssim_\P
    n^{-1} \log n
    + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n
    + n^{-7/6} h^{-1/2} (\log n)^{2/3}.
  \end{align*}

\end{lemma}

\begin{lemma}[Counterfactual covariance structure]
  \label{lem:kernel_app_counterfactual_covariance_structure}

  Writing $k_{i j}'$ for $k_h(W_{i j}^1, w')$ etc.,
  the counterfactual covariance function is
  %
  \begin{align*}
    &\Sigma_n^{1 \triangleright 0}(w,w')
    = \Cov\left[
    \hat f_W^{1 \triangleright 0}(w),
    \hat f_W^{1 \triangleright 0}(w')
    \right] \\
    &\quad=
    \frac{4}{n}
    \E\left[
    \Big(
    \psi_i
    \E\big[
    k_{i j} \psi_j
    \mid A_i^1
    \big]
    + \E\left[
    k_{r j} \psi_r
    \kappa_{i j}
    \mid X_i^0, X_i^1
    \right]
    \Big)
    \right. \\
    &\left.
    \qquad\qquad\quad
    \times
    \Big(
    \psi_i
    \E\big[
    k_{i j}' \psi_j
    \mid A_i^1
    \big]
    + \E\left[
    k_{r j}' \psi_r \kappa_{i j}
    \mid X_i^0, X_i^1
    \right]
    \Big)
    \right] \\
    &\qquad+
    \frac{2}{n^2}
    \E\left[
    k_{i j} k_{i j}'
    \psi_i^2 \psi_j^2
    \right]
    - \frac{4}{n}
    \E\left[
    k_{i j} \psi_i \psi_j
    \right]
    \E\left[
    k_{i j}' \psi_i \psi_j
    \right]
    + O\left( \frac{1}{n^{3/2}} + \frac{1}{\sqrt{n^4h}} \right).
  \end{align*}

\end{lemma}

\begin{lemma}[Gaussian approximation
  of the standardized counterfactual $t$-statistic]
  \label{lem:kernel_app_counterfactual_infeasible_t_statistic}

  Let Assumptions \ref{ass:kernel_data},
  \ref{ass:kernel_bandwidth}, and
  \ref{ass:kernel_app_counterfactual}
  hold, and suppose
  $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$.
  Define
  %
  \begin{align*}
    T_n^{1 \triangleright 0}(w)
    &= \frac{\hat f_W^{1 \triangleright 0}(w)
    - f_W^{1 \triangleright 0}(w)}
    {\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}
    \quad\text{and}\quad
    Z_n^{T, 1 \triangleright 0}(w)
    = \frac{Z_n^{f, 1 \triangleright 0}(w)}
    {\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}.
  \end{align*}
  %
  Then with $R_n \to \infty$ as in Lemma~\ref{lem:kernel_app_counterfactual_sa},
  %
  \begin{align*}
    &\sup_{w \in \cW}
    \left|
    T_n^{1 \triangleright 0}(w) - Z_n^{T, 1 \triangleright 0}(w)
    \right| \\
    &\quad\lesssim_\P
    \frac{
      n^{-1/2} \log n
      + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n
      + n^{-2/3} h^{-1/2} (\log n)^{2/3}
      + n^{1/2} h^{p \wedge \beta}}
    {\Dl^{1 \triangleright 0} + 1/\sqrt{n h}}.
- \end{align*} - -\end{lemma} - -\begin{theorem}[Infeasible counterfactual uniform confidence bands] - \label{thm:kernel_app_counterfactual_infeasible_ucb} - - Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, - \ref{ass:kernel_rates}, and \ref{ass:kernel_app_counterfactual} - hold and suppose that $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$. - Let $\alpha \in (0,1)$ be a confidence level - and define $q^{1 \triangleright 0}_{1-\alpha}$ as the quantile - satisfying - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \left| Z_n^{T,1 \triangleright 0}(w) \right| - \leq q^{1 \triangleright 0}_{1-\alpha} - \right) - &= - 1 - \alpha. - \end{align*} - % - Then - % - \begin{align*} - \P\left( - f_W^{1 \triangleright 0}(w) - \in - \left[ - \hat f_W^{1 \triangleright 0}(w) - \pm - q^{1 \triangleright 0}_{1-\alpha} - \sqrt{\Sigma_n^{1 \triangleright 0}(w,w)} - \, \right] - \, \textup{for all } - w \in \cW - \right) - \to 1 - \alpha. - \end{align*} -\end{theorem} -% -We propose an estimator for the counterfactual covariance function -$\Sigma_n^{1 \triangleright 0}$. First let -% -\begin{align*} - \hat\kappa(X_i^0, X_i^1, x) - &= - \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} - - \frac{\hat p_X^0(x)}{\hat p_X^1(x)} - \frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)}, -\end{align*} -% -and define the leave-out conditional expectation estimators -% -\begin{align*} - S_i^{1 \triangleright 0}(w) - &= - \hat\E\left[ - k_h(W_{i j}^1,w) \psi(X_j^1) \mid A_i^1 - \right] \\ - &= - \frac{1}{n-1} - \left( - \sum_{j=1}^{i-1} - k_h(W_{j i}^1,w) \hat\psi(X_j^1) - + \sum_{j=i+1}^n - k_h(W_{i j}^1,w) \hat\psi(X_j^1) - \right), \\ - \tilde S_i^{1 \triangleright 0}(w) - &= - \hat\E\left[ - k_h(W_{r j}^1,w) \psi(X_r^1) - \kappa(X_i^0, X_i^1, X_j^1) \mid X_i^0, X_i^1 - \right] \\ - &= - \frac{1}{n-1} - \sum_{j=1}^n - \I\{j \neq i\} - \hat\kappa(X_i^0, X_i^1, X_j^1) - S_j^{1 \triangleright 0}(w). -\end{align*} -% -Then set -% -\begin{align*} - \hat\Sigma_n^{1 \triangleright 0}(w,w') - &= - \frac{4}{n^2} - \sum_{i=1}^n - \left( - \hat\psi(X_i^1) - S_i^{1 \triangleright 0}(w) - + \tilde S_i^{1 \triangleright 0}(w) - \right) - \left( - \hat\psi(X_i^1) - S_i^{1 \triangleright 0}(w') - + \tilde S_i^{1 \triangleright 0}(w') - \right) \\ - &\quad- - \frac{4}{n^3(n-1)} - \sum_{i 0$ satisfy - $\sup_{f \in \cF} - \|f\|_{\bar\P,2} - \leq - \sigma - \leq - \|F\|_{\bar\P,2}$ - and - $M = \max_{1 \leq i \leq n} F(X_i)$. - Then with - $\delta = \sigma / \|F\|_{\bar\P,2} \in (0,1]$, - % - \begin{align*} - \E \left[ - \sup_{f \in \cF} - \big| G_n(f) \big| - \right] - &\lesssim - \|F\|_{\bar\P,2} - \, J\big(\delta, \cF, F \big) - + - \frac{\|M\|_{\P,2} \, J(\delta, \cF, F)^2}{\delta^2 \sqrt{n}}, - \end{align*} - % - where $\lesssim$ is up to a universal constant, - and $J(\delta, \cF, F)$ is the covering integral - % - \begin{align*} - J\big(\delta, \cF, F\big) - &= - \int_0^\delta - \sqrt{1 + - \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} - \diff{\varepsilon}, - \end{align*} - % - with the supremum taken over finite discrete probability - measures $\Q$ on $(S, \cS)$. 
- -\end{lemma} - -\begin{lemma}[A VC class maximal inequality for i.n.i.d.\ empirical processes] - \label{lem:kernel_app_maximal_vc_inid} - - Assume the same setup as in - Lemma~\ref{lem:kernel_app_maximal_entropy}, - and suppose that $\cF$ forms a VC-type class - in that - % - \begin{align*} - \sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) - &\leq - (C_1/\varepsilon)^{C_2} - \end{align*} - % - for all $\varepsilon \in (0,1]$, - for some constants - $C_1 \geq e$ - (where $e$ is the standard exponential constant) - and $C_2 \geq 1$. - Then for $\delta \in (0,1]$ - we have the covering integral bound - % - $J\big(\delta, \cF, F\big) \leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}$, - % - and so by Lemma~\ref{lem:kernel_app_maximal_entropy}, - up to a universal constant, - % - \begin{align*} - \E \left[ - \sup_{f \in \cF} - \big| G_n(f) \big| - \right] - &\lesssim - \sigma - \sqrt{C_2 \log (C_1/\delta)} - + - \frac{\|M\|_{\P,2} C_2 \log(C_1/\delta)}{\sqrt{n}} \\ - &\lesssim - \sigma - \sqrt{C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} - + - \frac{\|M\|_{\P,2} C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} - {\sqrt{n}}. - \end{align*} - % -\end{lemma} - -\subsection{Strong approximation results} - -Next we provide two strong approximation results. -The first is a corollary of the KMT approximation -\citep{komlos1975approximation} -which applies to bounded-variation functions -of i.i.d.\ variables. -The second is an extension of the Yurinskii coupling -\citep{belloni2019conditional} -which applies to Lipschitz functions -of i.n.i.d.\ variables. - -\begin{lemma}[A KMT approximation corollary] - \label{lem:kernel_app_kmt_corollary} - - For $n \geq 1$ - let $X_1, \ldots, X_n$ - be i.i.d.\ real-valued random variables and - $g_n: \R \times \R \to \R$ - be a function satisfying - the total variation bound - $\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV < \infty$. - Then on some probability space - there exist independent copies of - $X_1, \ldots, X_n$, - denoted - $X_1', \ldots, X_n'$, - and a mean-zero Gaussian process $Z_n(x)$ - such that if we define - the empirical process - % - \begin{align*} - G_n(x) - = \frac{1}{\sqrt n} \sum_{i=1}^n - \Big(g_n(X_i',x) - \E\big[g_n(X_i',x)\big]\Big), - \end{align*} - % - then - for some universal positive constants - $C_1$, $C_2$, and $C_3$, - % - \begin{align*} - \P\left( - \sup_{x \in \R} - \big|G_n(x) - Z_n(x)\big| - > \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV - \, \frac{t + C_1 \log n}{\sqrt n} - \right) - \leq C_2 e^{-C_3 t}. - \end{align*} - % - Further, $Z_n$ - has the same covariance structure as $G_n$ - in the sense that for all $x,\, x' \in \R$, - % - \begin{align*} - \E\big[Z_n(x) Z_n(x')\big] - = \E\big[G_n(x) G_n(x')\big]. - \end{align*} - % - By independently sampling from the law of - $Z_n$ conditional on $X_1', \ldots, X_n'$, - we can assume that - $Z_n$ is a function only of $X_1', \ldots, X_n'$ - and some independent random noise. - -\end{lemma} - -\begin{lemma}[Yurinskii coupling for Lipschitz i.n.i.d.\ empirical processes] - \label{lem:kernel_app_yurinskii_corollary} - - For $n \geq 2$ let $X_1, \dots, X_n$ - be independent but not necessarily identically distributed - (i.n.i.d.) random variables - taking values in a measurable space $(S, \cS)$ - and let $\cX_n \subseteq \R$ - be a compact interval - with $\left|\log \Leb(\cX_n)\right| \leq C_1 \log n$ - where $C_1 > 0$ is a constant. 
- Let $g_n$ be measurable on $S \times \cX_n$ satisfying - $\sup_{\xi \in S} \sup_{x \in \cX_n} |g_n(\xi, x)| \leq M_n$ - and - $\sup_{x \in \cX_n} \max_{1 \leq i \leq n} \Var[g_n(X_i, x)] - \leq \sigma_n^2$, - with $\left|\log M_n\right| \leq C_1 \log n$ - and $\left|\log \sigma_n^2\right| \leq C_1 \log n$. - Suppose that $g_n$ satisfies the following uniform - Lipschitz condition: - % - \begin{align*} - \sup_{\xi \in S} - \sup_{x,x' \in \cX_n} - \left| - \frac{g_n(\xi, x) - g_n(\xi, x')} - {x-x'} - \right| - \leq - l_{n,\infty}, - \end{align*} - % - and also the following $L^2$ - Lipschitz condition: - % - \begin{align*} - \sup_{x,x' \in \cX_n} - \E\left[ - \frac{1}{n} - \sum_{i=1}^n - \left| - \frac{g_n(X_i, x) - g_n(X_i, x')} - {x-x'} - \right|^2 - \right]^{1/2} - \leq - l_{n,2}, - \end{align*} - % - where $0 < l_{n,2} \leq l_{n,\infty}$, - $\left|\log l_{n,2}\right| \leq C_1 \log n$, and - $\left|\log l_{n,\infty}\right| \leq C_1 \log n$. - Then for any $t_n > 0$ with - $\left|\log t_n\right| \leq C_1 \log n$, - there is a probability space carrying - independent copies of $X_1, \ldots, X_n$ denoted $X_1', \ldots, X_n'$ - and a mean-zero Gaussian process $Z_n(x)$ - such that if we define the empirical process - % - $G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n - \big( g_n(X'_i,x) - \E[g_n(X'_i,x)] \big)$, - % - then - % - \begin{align*} - &\P\left( - \sup_{x \in \cX_n} - \big| - G_n(x) - Z_n(x) - \big| - > t_n - \right) \\ - &\quad\leq - \frac{ - C_2 - \sigma_n - \sqrt{\Leb(\cX_n)} - \sqrt{\log n} - \sqrt{M_n + \sigma_n\sqrt{\log n}} - }{n^{1/4} t_n^2} - \sqrt{ - l_{n,2} - \sqrt{\log n} - + \frac{l_{n,\infty}}{\sqrt n} - \log n} - \end{align*} - % - where $C_2 > 0$ is a constant depending only on $C_1$. - Further, $Z_n$ - has the same covariance structure as $G_n$ - in the sense that for all $x, x' \in \cX_n$, - % - \begin{align*} - \E\big[Z_n(x) Z_n(x')\big] - = \E\big[G_n(x) G_n(x')\big]. - \end{align*} - -\end{lemma} - -\subsection{The Vorob'ev--Berkes--Philipp theorem} - -We present a generalization of the Vorob'ev--Berkes--Philipp theorem -\citep{dudley1999uniform} -which allows one to ``glue'' multiple random variables -or stochastic processes onto the same probability space, -while preserving some pairwise distributions. -We begin with some definitions. - -\begin{definition}[Tree] - A \emph{tree} is a finite undirected graph which is connected and contains no - cycles or self-loops. -\end{definition} - -\begin{definition}[Polish Borel probability space] - A \emph{Polish Borel probability space} - is a triple $(\cX, \cF, \P)$, - where $\cX$ is a Polish space - (a topological space metrizable by a complete separable metric), - $\cF$ is the Borel $\sigma$-algebra induced on $\cX$ by its topology, - and $\P$ is a probability measure on $(\cX, \cF)$. - Important examples of Polish spaces include $\R^d$ and - the Skorokhod space $\cD[0,1]^d$ for $d \geq 1$. - In particular, - one can consider vectors of real-valued random variables - or stochastic processes indexed by - compact subsets of $\R^d$ which have - almost surely continuous trajectories. -\end{definition} - -\begin{definition}[Projection of a law] - Let $(\cX_1, \cF_1)$ and $(\cX_2, \cF_2)$ - be measurable spaces, and - let $\P_{12}$ be a law on the - product space - $(\cX_1 \times \cX_2, \cF_1 \otimes \cF_2)$. - The \emph{projection} of $\P_{12}$ - onto $\cX_1$ is the law - $\P_1$ defined on $(\cX_1, \cF_1)$ - by $\P_1 = \P_{12} \circ \pi_1^{-1}$ - where $\pi_1(x_1, x_2) = x_1$ - is the first-coordinate projection. 
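  For example, if $\P_{12}$ is the joint law of a pair of random variables
  $(X_1, X_2)$ taking values in $\cX_1 \times \cX_2$,
  then the projection of $\P_{12}$ onto $\cX_1$
  is simply the marginal law of $X_1$.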
-\end{definition} - -\begin{lemma}[Vorob'ev--Berkes--Philipp theorem, tree form] - \label{lem:kernel_app_vbp} - - Let $\cT$ be a tree with vertex set $\cV = \{1, \ldots, n\}$ - and edge set $\cE$. - Suppose that attached to each vertex $i$ is a - Polish Borel probability space - $(\cX_i, \cF_i, \P_i)$. - Suppose that attached to each edge $(i,j) \in \cE$ - (where $i t)\leq r_i$ - for each $1 \leq i \leq n-1$, - where $\|\cdot\|$ is a norm on $\cD[0,1]$. - Then there exist copies of - $X_1, \ldots, X_n$ - denoted - $X_1'', \ldots, X_n''$ - satisfying - $\P\big(\|X_{i+1}'' - X_i''\| > t)\leq r_i$ - for each $1 \leq i \leq n$. - That is, all of the inequalities - can be satisfied simultaneously - on the same probability space. - - \end{enumerate} - -\end{remark} - -\section{Proofs} -\label{sec:kernel_app_proofs} - -We present full proofs of all the results stated in -Chapter~\ref{ch:kernel} and Appendix~\ref{app:kernel}. - -\subsection{Preliminary lemmas} - -In this section we list some results -in probability and U-statistic theory -which are used in proofs of our main results. -Other auxiliary lemmas will be introduced when -they are needed. - -\begin{lemma}[Bernstein's inequality for independent random variables] - \label{lem:kernel_app_bernstein} - - Let $X_1, \ldots, X_n$ be independent real-valued - random variables with - $\E[X_i] = 0$, $|X_i| \leq M$, and - $\E[X_i^2] \leq \sigma^2$, - where $M$ and $\sigma$ are non-random. - Then for all $t>0$, - % - \begin{align*} - \P \left( - \left| \frac{1}{n} \sum_{i=1}^n X_i \right| \geq t - \right) - \leq 2 \exp \left( - - \frac{t^2 n} - {2 \sigma^2 + \frac{2}{3} M t} - \right). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_bernstein}] - - See for example - Lemma~2.2.9 in~\citet{van1996weak}. -\end{proof} - -\begin{lemma}[The matrix Bernstein inequality] - \label{lem:kernel_app_matrix_bernstein} - - For $1 \leq i \leq n$ - let $X_i$ be independent symmetric $d \times d$ - real random matrices - with expected values $\mu_i = \E[X_i]$. - Suppose that - $\|X_i - \mu_i\|_2 \leq M$ - almost surely for all $1 \leq i \leq n$ - where $M$ is non-random, and define - $\sigma^2 = \big\| \sum_i \E[(X_i - \mu_i)^2] \big\|_2$. - Then there exists a universal constant $C > 0$ - such that - for any $t > 0$ and $q \geq 1$, - % - \begin{align*} - \P\left( - \left\| - \sum_{i=1}^n - \left( - X_i - \mu_i - \right) - \right\|_2 - \geq - 2 \sigma \sqrt{t} - + \frac{4}{3} M t - \right) - &\leq - 2 d e^{-t}, \\ - \E\left[ - \left\| - \sum_{i=1}^n - \left( - X_i - \mu_i - \right) - \right\|_2^q - \right]^{1/q} - &\leq - C \sigma \sqrt{q + \log 2d} - + C M (q + \log 2d). - \end{align*} - % - Another simplified version of this is as follows: - suppose that - $\|X_i\|_2 \leq M$ almost surely, - so that - $\|X_i - \mu_i\|_2 \leq 2M$. - Then since - $\sigma^2 \leq n M^2$, - we have - % - \begin{align*} - \P\left( - \left\| - \sum_{i=1}^n - \left( - X_i - \mu_i - \right) - \right\|_2 - \geq - 4M \big(t + \sqrt{n t}\big) - \right) - &\leq - 2 d e^{-t}, \\ - \E\left[ - \left\| - \sum_{i=1}^n - \left( - X_i - \mu_i - \right) - \right\|_2^q - \right]^{1/q} - &\leq - C M - \big(q + \log 2d + \sqrt{n(q + \log 2d)}\big). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_matrix_bernstein}] - - See Lemma~3.2 in \citet{minsker2019moment}. -\end{proof} - -\begin{lemma}[A maximal inequality for Gaussian vectors] - \label{lem:kernel_app_gaussian_vector_maximal} - - Take $n \geq 2$. 
- Let $X_i \sim \cN(0, \sigma_i^2)$ - for $1 \leq i \leq n$ - with $\sigma_i^2 \leq \sigma^2$. - Then - % - \begin{align} - \label{eq:kernel_app_gaussian_vector_maximal} - \E\left[ - \max_{1 \leq i \leq n} - X_i - \right] - &\leq - \sigma \sqrt{2 \log n}, \\ - \label{eq:kernel_app_gaussian_vector_maximal_abs} - \E\left[ - \max_{1 \leq i \leq n} - |X_i| - \right] - &\leq - 2 \sigma \sqrt{\log n}. - \end{align} - % - If $\Sigma_1$ and $\Sigma_2$ are constant - positive semi-definite $n \times n$ matrices - and $N \sim \cN(0,I_n)$, - then - % - \begin{align} - \label{eq:kernel_app_gaussian_difference_psd} - \E\Big[ - \big\| - \Sigma_1^{1/2} N - - \Sigma_2^{1/2} N - \big\|_\infty - \Big] - &\leq - 2 \sqrt{\log n} \, - \big\| - \Sigma_1 - \Sigma_2 - \big\|_2^{1/2}. - \end{align} - % - If further $\Sigma_1$ is - positive definite, - then - % - \begin{align} - \label{eq:kernel_app_gaussian_difference_pd} - \E\Big[ - \big\| - \Sigma_1^{1/2} N - - \Sigma_2^{1/2} N - \big\|_\infty - \Big] - &\leq - \sqrt{\log n} \, - \lambda_{\min}(\Sigma_1)^{-1/2} \, - \big\| - \Sigma_1 - \Sigma_2 - \big\|_2. - \end{align} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_vector_maximal}] - - For $t > 0$, - Jensen's inequality on the concave logarithm function - gives - % - \begin{align*} - \E\left[ - \max_{1 \leq i \leq n} - X_i - \right] - &= - \frac{1}{t} - \E\left[ - \log - \exp - \max_{1 \leq i \leq n} - t X_i - \right] - \leq - \frac{1}{t} - \log - \E\left[ - \exp - \max_{1 \leq i \leq n} - t X_i - \right] - \leq - \frac{1}{t} - \log - \sum_{i=1}^n - \E\left[ - \exp - t X_i - \right] \\ - &= - \frac{1}{t} - \log - \sum_{i=1}^n - \exp - \left( - \frac{t^2 \sigma_i^2}{2} - \right) - \leq - \frac{1}{t} - \log n - + \frac{t \sigma^2}{2}, - \end{align*} - % - by the Gaussian moment generating function. - Minimizing with $t = \sqrt{2 \log n} / \sigma$ - yields \eqref{eq:kernel_app_gaussian_vector_maximal}. - For \eqref{eq:kernel_app_gaussian_vector_maximal_abs}, - we use the symmetry of the Gaussian distribution: - % - \begin{align*} - \E\left[ - \max_{1 \leq i \leq n} - |X_i| - \right] - &= - \E\left[ - \max_{1 \leq i \leq n} - \{X_i, -X_i\} - \right] - \leq - \sigma \sqrt{2 \log 2n} - \leq - 2 \sigma \sqrt{\log n}. - \end{align*} - % - For \eqref{eq:kernel_app_gaussian_difference_psd} - and \eqref{eq:kernel_app_gaussian_difference_pd}, - note that - $\Sigma_1^{1/2} N - \Sigma_2^{1/2} N$ - is Gaussian with covariance matrix - $\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. - The variances of of its components are the diagonal - elements of this matrix, namely - % - \begin{align*} - \sigma_i^2 - &= - \Var\big[ - \big(\Sigma_1^{1/2} N - \Sigma_2^{1/2} N\big)_i - \big] - = - \Big(\big( - \Sigma_1^{1/2} - \Sigma_2^{1/2} - \big)^2\Big)_{ii}. - \end{align*} - % - Note that if $e_i$ is the - $i$th standard unit basis vector, - then for any real symmetric matrix $A$, - we have - $e_i^\T A^2 e_i = (A^2)_{ii}$, - so in particular - $(A^2)_{ii} \leq \|A\|_2^2$. - Therefore - % - \begin{align*} - \sigma_i^2 - &\leq - \big\| - \Sigma_1^{1/2} - \Sigma_2^{1/2} - \big\|_2^2 - =\vcentcolon - \sigma^2. - \end{align*} - % - Applying - \eqref{eq:kernel_app_gaussian_vector_maximal_abs} - then gives - % - \begin{align*} - \E\Big[ - \big\| - \Sigma_1^{1/2} N - - \Sigma_2^{1/2} N - \big\|_\infty - \Big] - &\leq - 2 \sqrt{\log n} \, - \big\| - \Sigma_1^{1/2} - \Sigma_2^{1/2} - \big\|_2. 
- \end{align*} - % - By Theorem~X.1.1 - in \citet{bhatia1997matrix}, - we can deduce - % - \begin{align*} - \big\| - \Sigma_1^{1/2} - \Sigma_2^{1/2} - \big\|_2 - &\leq - \big\| - \Sigma_1 - \Sigma_2 - \big\|_2^{1/2}, - \end{align*} - % - giving - \eqref{eq:kernel_app_gaussian_difference_psd}. - If $\Sigma_1$ - is positive definite, - Theorem~X.3.8 in - \citet{bhatia1997matrix} gives - \eqref{eq:kernel_app_gaussian_difference_pd}: - % - \begin{align*} - \big\| - \Sigma_1^{1/2} - \Sigma_2^{1/2} - \big\|_2 - &\leq - \frac{1}{2} - \lambda_{\min}(\Sigma_1)^{-1/2} \, - \big\| - \Sigma_1 - \Sigma_2 - \big\|_2. - \end{align*} - % -\end{proof} - -\begin{lemma}[Maximal inequalities for Gaussian processes] - \label{lem:kernel_app_gaussian_process_maximal} - - Let $Z$ be a separable - mean-zero Gaussian process indexed - by $x \in \cX$. - Recall that $Z$ is separable for example if - $\cX$ is Polish and $Z$ has - continuous trajectories. - Define its covariance structure on $\cX \times \cX$ - by $\Sigma(x, x') = \E[Z(x) Z(x')]$, - and the corresponding semimetric on $\cX$ by - % - \begin{align*} - \rho(x,x') - &= - \E\big[\big(Z(x) - Z(x')\big)^2\big]^{1/2} - = \big(\Sigma(x,x) - - 2 \Sigma(x,x') - + \Sigma(x',x')\big)^{1/2}. - \end{align*} - % - Let $N(\varepsilon, \cX, \rho)$ - denote the $\varepsilon$-covering number of $\cX$ - with respect to the semimetric $\rho$. - Define $\sigma = \sup_x \Sigma(x,x)^{1/2}$. - Then there exists a universal constant $C > 0$ - such that for any $\delta > 0$, - % - \begin{align*} - \E\left[ - \sup_{x \in \cX} - |Z(x)| - \right] - &\leq - C \sigma - + C \int_0^{2\sigma} - \sqrt{\log N(\varepsilon, \cX, \rho)} - \diff{\varepsilon}, \\ - \E\left[ - \sup_{\rho(x,x') \leq \delta} - |Z(x) - Z(x')| - \right] - &\leq - C \int_0^{\delta} - \sqrt{\log N(\varepsilon, \cX, \rho)} - \diff{\varepsilon}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_process_maximal}] - - See Corollary~2.2.8 in \citet{van1996weak}, - noting that for any $x,x' \in \cX$, we have - $\E[|Z(x)|] \lesssim \sigma$ and - $\rho(x,x') \leq 2\sigma$, - implying that - $\log N(\varepsilon, \cX, \rho) = 0$ - for all - $\varepsilon > 2 \sigma$. -\end{proof} - -\begin{lemma}[Anti-concentration for Gaussian process absolute suprema] - \label{lem:kernel_app_anticoncentration} - - Let $Z$ be a separable mean-zero Gaussian process - indexed by a semimetric space $\cX$ with - $\E[Z(x)^2] = 1$ - for all $x \in \cX$. - Then for any $\varepsilon > 0$, - % - \begin{align*} - \sup_{t \in \R} - \P\left( - \left| - \sup_{x \in \cX} - \big| Z(x) \big| - - t - \right| - \leq \varepsilon - \right) - &\leq - 4 \varepsilon - \left( - 1 + \E\left[ - \sup_{x \in \cX} - \big| Z(x) \big| - \right] - \right). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_anticoncentration}] - - See Corollary~2.1 - in \citet{chernozhukov2014anti}. -\end{proof} - -\begin{lemma}[No slowest rate of convergence in probability] - \label{lem:kernel_app_slow_convergence} - - Let $X_n$ be a sequence of real-valued random - variables with - $X_n = o_\P(1)$. - Then there exists a deterministic sequence - $\varepsilon_n \to 0$ - such that - $\P\big(|X_n| > \varepsilon_n\big) \leq \varepsilon_n$ - for all $n \geq 1$. - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_slow_convergence}] - - Define the following deterministic sequence - for $k \geq 1$. 
- % - \begin{align*} - \tau_k - &= - \sup - \big\{ - n \geq 1: - \P\big(|X_n| > 1/k\big) - > 1/k - \big\} - \vee - (\tau_{k-1} +1) - \end{align*} - % - with $\tau_0 = 0$. - Since $X_n = o_\P(1)$, - each $\tau_k$ is finite - and so we can define - $\varepsilon_n = \frac{1}{k}$ - where $\tau_k < n \leq \tau_{k+1}$. - Then, noting that $\varepsilon_n \to 0$, - we have - $\P\big(|X_n| > \varepsilon_n\big) - = \P\big(|X_n| > 1/k\big) \leq 1/k = \varepsilon_n$. -\end{proof} - -\begin{lemma}[General second-order Hoeffding-type decomposition] - \label{lem:kernel_app_general_hoeffding} - - Let $\cU$ be a vector space. - Let $u_{i j} \in \cU$ be defined for - $1 \leq i, j \leq n$ - and - $i \neq j$. - Suppose that $u_{i j} = u_{j i}$ - for all $i,j$. - Then for any $u_i \in \cU$ - (for $1 \leq i \leq n$) - and any $u \in \cU$, - the following decomposition holds: - % - \begin{align*} - \sum_{i=1}^n - \sum_{\substack{j=1 \\ j \neq i}}^n - \big(u_{i j} - u\big) - &= - 2(n-1) - \sum_{i=1}^n - \big(u_i - u\big) - + - \sum_{i=1}^n - \sum_{\substack{j=1 \\ j \neq i}}^n - \big(u_{i j} - u_i - u_j + u\big). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_general_hoeffding}] - - We compute the left hand side minus the right hand side, - beginning by observing that all of the - $u_{i j}$ and $u$ terms clearly cancel. - % - \begin{align*} - &\sum_{i=1}^n - \sum_{j \neq i}^n - \big(u_{i j} - u\big) - - 2(n-1) - \sum_{i=1}^n - \big(u_i - u\big) - - - \sum_{i=1}^n - \sum_{j \neq i} - \big(u_{i j} - u_i - u_j + u\big) \\ - &\qquad= - - 2(n-1) - \sum_{i=1}^n - u_i - - - \sum_{i=1}^n - \sum_{j \neq i}^n - \big(- u_i - u_j\big) - = - - 2(n-1) - \sum_{i=1}^n - u_i - + - \sum_{i=1}^n - \sum_{j \neq i}^n - u_i - + - \sum_{j=1}^n - \sum_{i \neq j}^n - u_j \\ - &\qquad= - - 2(n-1) - \sum_{i=1}^n - u_i - + - (n-1) - \sum_{i=1}^n - u_i - + - (n-1) - \sum_{j=1}^n - u_j - = 0. - \end{align*} -\end{proof} - -\begin{lemma}[A U-statistic concentration inequality] - \label{lem:kernel_app_ustat_concentration} - - Let $(S,\cS)$ be a measurable space and - $X_1, \ldots, X_n$ be i.i.d.\ $S$-valued random variables. - Let $H: S^m \to \R$ be a function of $m$ variables - satisfying the symmetry property - $H(x_1, \ldots, x_m) = H(x_{\tau (1)}, \ldots, x_{\tau (m)})$ - for any $m$-permutation $\tau$. - Suppose also that - $\E[H(X_1, \ldots, X_m)] = 0$. - Let - $M = \|H\|_\infty$ - and - $\sigma^2 = \E\big[\E[H(X_1, \ldots, X_m) \mid X_1]^2\big]$. - Define the U-statistic - % - \begin{align*} - U_n - &= - \frac{m!(n-m)!}{n!} - \sum_{1 \leq i_1 < \cdots < i_m \leq n} - H(X_1, \ldots, X_n). - \end{align*} - % - Then for any $t > 0$, - with $C_1(m)$, $C_2(m)$ - positive constants depending only on $m$, - % - \begin{align*} - \P\left( - |U_n| > t - \right) - &\leq - 4 \exp \left( - - \frac{n t^2}{C_1(m) \sigma^2 + C_2(m) M t} - \right). - \end{align*} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_concentration}] - See Theorem~2 in \citet{arcones1995bernstein}. -\end{proof} - -\begin{lemma}[A second-order U-process maximal inequality] - \label{lem:kernel_app_uprocess_maximal} - - Let $X_1 \ldots, X_n$ - be i.i.d.\ random variables taking values - in a measurable space $(S, \cS)$ - with distribution $\P$. - Let $\cF$ be a class of measurable functions from - $S \times S$ to $\R$ which is also pointwise measurable. 
- Define the degenerate second-order U-process - % - \begin{align*} - U_n(f) - = - \frac{2}{n(n-1)} - \sum_{i 0$ be any deterministic value satisfying - $\sup_{f \in \cF} \|f\|_{\P,2} \leq \sigma \leq \|F\|_{\P,2}$, - and define the random variable $M = \max_{i,j} |F(X_i, X_j)|$. - Then there exists a universal constant $C_3 > 0$ - satisfying - % - \begin{align*} - n - \E\left[ - \sup_{f \in \cF} - \big| U_n(f) \big| - \right] - &\leq - C_3 \sigma - \Big( - C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) - \Big) - + \frac{C_3 \|M\|_{\P,2}}{\sqrt{n}} - \Big( - C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) - \Big)^2. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_uprocess_maximal}] - - Apply Corollary~5.3 - from \citet{chen2020jackknife} - with the order of the U-statistic fixed at - $r=2$, - and with $k=2$. -\end{proof} - -\begin{lemma}[A U-statistic matrix concentration inequality] - \label{lem:kernel_app_ustat_matrix_concentration} - - Let $X_1, \ldots, X_n$ be i.i.d.\ random variables - taking values in a measurable space $(S, \cS)$. - Suppose - $H: S^2 \to \R^{d \times d}$ - is a measurable matrix-valued function - of two variables - satisfying the following: - % - \begin{enumerate}[label=(\roman*)] - - \item - $H(X_1, X_2)$ is an almost surely symmetric matrix. - - \item - $\|H(X_1, X_2)\|_2 \leq M$ almost surely. - - \item - $H$ is a symmetric function in its arguments in that - $H(X_1, X_2) = H(X_2, X_1)$. - - \item - $H$ is degenerate in the sense that - $\E[H(X_1, x_2)] = 0$ for all $x_2 \in S$. - - \end{enumerate} - % - Let $U_n = \sum_i \sum_{j \neq i} H(X_i, X_j)$ - be a U-statistic, - and define the variance-type constant - % - \begin{align*} - \sigma^2 - &= - \E\left[ - \left\| - \E\left[ - H(X_i, X_j)^2 - \mid X_j - \right] - \right\|_2 - \right]. - \end{align*} - % - Then for a universal constant $C > 0$ - and for all $t > 0$, - % - \begin{align*} - \P\left( - \|U_n\|_2 - \geq - C \sigma n (t + \log d) - + C M \sqrt{n} (t + \log d)^{3/2} - \right) - &\leq - C e^{-t}. - \end{align*} - % - By Jensen's inequality, - $\sigma^2 \leq \E[ \| H(X_i, X_j)^2 \|_2 ] - = \E[ \| H(X_i, X_j) \|_2^2 ] \leq M^2$, giving the simpler - % - \begin{align*} - \P\left( - \|U_n\|_2 - \geq - 2 C M n - (t + \log d)^{3/2} - \right) - &\leq - C e^{-t}. - \end{align*} - % - From this last inequality we deduce a moment bound - by integration of tail probabilities: - % - \begin{align*} - \E\left[ - \|U_n\|_2 - \right] - &\lesssim - M n (\log d)^{3/2}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}] - - We apply results from \citet{minsker2019moment}. - - \proofparagraph{decoupling} - - Let $\bar U_n = \sum_{i=1}^n \sum_{j=1}^n H(X_i^{(1)}, X_j^{(2)})$ - be a decoupled matrix U-statistic, - where $X^{(1)}$ and $X^{(2)}$ - are i.i.d.\ copies of the sequence $X_1, \ldots, X_n$. - By Lemma~5.2 in \citet{minsker2019moment}, - since we are only stating this result for - degenerate U-statistics of order 2, - there exists a universal constant $D_2$ - such that for any $t > 0$, - we have - % - \begin{align*} - \P\left( - \|U_n\|_2 \geq t - \right) - &\leq - D_2 - \P\left( - \|\bar U_n\|_2 \geq t / D_2 - \right). - \end{align*} - - \proofparagraph{concentration of the decoupled U-statistic} - - By Equation~11 in \citet{minsker2019moment}, - we have the following concentration inequality - for decoupled degenerate U-statistics. 
  For some universal constant $C_1$
  and for any $t > 0$,
  %
  \begin{align*}
    \P\left(
    \|\bar U_n\|_2
    \geq
    C_1 \sigma n (t + \log d)
    + C_1 M \sqrt{n} (t + \log d)^{3/2}
    \right)
    &\leq
    e^{-t}.
  \end{align*}

  \proofparagraph{concentration of the original U-statistic}

  Hence we have
  %
  \begin{align*}
    &\P\left(
    \|U_n\|_2
    \geq
    C_1 D_2 \sigma n (t + \log d)
    + C_1 D_2 M \sqrt{n} (t + \log d)^{3/2}
    \right) \\
    &\quad\leq
    D_2 \P\left(
    \|\bar U_n\|_2
    \geq
    C_1 \sigma n (t + \log d)
    + C_1 M \sqrt{n} (t + \log d)^{3/2}
    \right)
    \leq
    D_2 e^{-t}.
  \end{align*}
  %
  The main result follows by setting
  $C = C_1 + C_1 D_2$.

  \proofparagraph{moment bound}

  We now obtain a moment bound for the simplified version.
  We already have that
  %
  \begin{align*}
    \P\left(
    \|U_n\|_2
    \geq
    2 C M n
    (t + \log d)^{3/2}
    \right)
    &\leq
    C e^{-t}.
  \end{align*}
  %
  This implies that for any $t \geq \log d$,
  we have
  %
  \begin{align*}
    \P\left(
    \|U_n\|_2
    \geq
    8 C M n
    t^{3/2}
    \right)
    &\leq
    C e^{-t}.
  \end{align*}
  %
  Defining
  $s = 8 C M n t^{3/2}$
  so $t = \left( \frac{s}{8C M n} \right)^{2/3}$
  shows that for any $s \geq 8C M n(\log d)^{3/2}$,
  %
  \begin{align*}
    \P\left(
    \|U_n\|_2
    \geq
    s
    \right)
    &\leq
    C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}.
  \end{align*}
  %
  Hence the moment bound is obtained:
  %
  \begin{align*}
    \E\left[
    \|U_n\|_2
    \right]
    &=
    \int_0^\infty
    \P\left(
    \|U_n\|_2
    \geq
    s
    \right)
    \diff{s} \\
    &=
    \int_0^{8C M n(\log d)^{3/2}}
    \P\left(
    \|U_n\|_2
    \geq
    s
    \right)
    \diff{s}
    +
    \int_{8C M n(\log d)^{3/2}}^\infty
    \P\left(
    \|U_n\|_2
    \geq
    s
    \right)
    \diff{s} \\
    &\leq
    8C M n(\log d)^{3/2}
    +
    \int_0^\infty
    C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}
    \diff{s} \\
    &=
    8C M n(\log d)^{3/2}
    +
    8C^2 M n
    \int_0^\infty
    e^{-s^{2/3}}
    \diff{s}
    \lesssim
    Mn(\log d)^{3/2}.
  \end{align*}
\end{proof}

\subsection{Technical lemmas}

Before presenting the proof of
Lemma~\ref{lem:kernel_app_maximal_entropy},
we give some auxiliary lemmas;
namely a symmetrization inequality
(Lemma~\ref{lem:kernel_app_symmetrization}),
a Rademacher contraction principle
(Lemma~\ref{lem:kernel_app_contraction}),
and a Hoffmann--J{\o}rgensen inequality
(Lemma~\ref{lem:kernel_app_hoffmann}).
Recall that the Rademacher distribution
places probability mass of $1/2$
on each of the points $-1$ and $1$.

\begin{lemma}[A symmetrization inequality for i.n.i.d.\ variables]
  \label{lem:kernel_app_symmetrization}

  Let $(S, \cS)$ be a measurable space and
  $\cF$ a class of Borel-measurable functions
  from $S$ to $\R$ which is pointwise measurable
  (i.e.\ it contains a countable dense subset
  under pointwise convergence).
  Let $X_1, \ldots, X_n$
  be independent
  but not necessarily identically distributed
  $S$-valued random variables.
  Let $a_1, \ldots, a_n$ be arbitrary points in $\R$
  and $\phi$ a non-negative non-decreasing convex function
  from $\R$ to $\R$.
  Define $\varepsilon_1, \ldots, \varepsilon_n$
  as independent Rademacher
  random variables,
  independent of $X_1, \ldots, X_n$.
  Then
  %
  \begin{align*}
    \E \left[
    \phi \left(
    \sup_{f \in \cF}
    \left|
    \sum_{i=1}^n
    \Big(
    f(X_i)
    -
    \E[f(X_i)]
    \Big)
    \right|
    \right)
    \right]
    &\leq
    \E \left[
    \phi \left(
    2
    \sup_{f \in \cF}
    \left|
    \sum_{i=1}^n
    \varepsilon_i
    \Big(
    f(X_i)
    -
    a_i
    \Big)
    \right|
    \right)
    \right].
- \end{align*} - % - Note that in particular this holds with $a_i = 0$ - and also holds with $\phi(t) = t \vee 0$. - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_symmetrization}] - - See Lemma~2.3.6 in - \citet{van1996weak}. - % -\end{proof} - -\begin{lemma}[A Rademacher contraction principle] - \label{lem:kernel_app_contraction} - - Let $\varepsilon_1, \ldots, \varepsilon_n$ - be independent Rademacher random variables - and $\cT$ be a bounded subset of $\R^n$. - Define - $M = \sup_{t \in \cT} \max_{1 \leq i \leq n} |t_i|$. - Then, noting that the supremum is measurable - because $\cT$ is a subset of a separable metric space - and is therefore itself separable, - % - \begin{align*} - \E - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i^2 - \right| - \right] - &\leq - 4M \, - \E - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i - \right| - \right]. - \end{align*} - % - This gives the following corollary. - Let $X_1, \ldots, X_n$ be mutually independent - and also independent of $\varepsilon_1, \ldots, \varepsilon_n$. - Let $\cF$ be a pointwise measurable class of functions - from a measurable space $(S, \cS)$ to $\R$, - with measurable envelope $F$. - Define $M = \max_i F(X_i)$. - Then we obtain - % - \begin{align*} - \E - \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i)^2 - \right| - \right] - &\leq - 4 - \E - \left[ - M - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right]. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_contraction}] - - Apply Theorem~4.12 from \citet{ledoux1991probability} with $F$ the identity - function and - % - \begin{align*} - \psi_i(s) - = \psi(s) - &= - \min - \left( - \frac{s^2}{2M}, - \frac{M}{2} - \right). - \end{align*} - % - This is a weak contraction - (i.e.\ 1-Lipschitz) - because it is continuous, - differentiable on $(-M,M)$ - with derivative bounded by - $|\psi'(s)| \leq |s|/M \leq 1$, - and constant outside $(-M,M)$. - Note that since $|t_i| \leq M$ - by definition, - we have $\psi_i(t_i) = t_i^2 / (2M)$. - Hence - by Theorem~4.12 - from \citet{ledoux1991probability}, - % - \begin{align*} - \E - \left[ - F - \left( - \frac{1}{2} - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - \psi_i(t_i) - \right| - \right) - \right] - &\leq - \E - \left[ - F - \left( - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i - \right| - \right) - \right], \\ - \E - \left[ - \frac{1}{2} - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - \frac{t_i^2}{2M} - \right| - \right] - &\leq - \E - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i - \right| - \right], \\ - \E - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i^2 - \right| - \right] - &\leq - 4M \, - \E - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i - \right| - \right]. - \end{align*} - % - For the corollary, set - $\cT = \left\{\big(f(X_1), \ldots, f(X_n)\big) : f \in \cF\right\}$. 
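  Since $|f(X_i)| \leq F(X_i)$ for every $f \in \cF$,
  this choice of $\cT$ satisfies
  $\sup_{t \in \cT} \max_{1 \leq i \leq n} |t_i|
  \leq \max_{1 \leq i \leq n} F(X_i) = M$,
  so the first part applies conditionally on $X_1, \ldots, X_n$
  with this (possibly larger) value of $M$.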
- For a fixed realization - $X_1, \ldots, X_n$, - % - \begin{align*} - \E_\varepsilon - \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i)^2 - \right| - \right] - &= - \E_\varepsilon - \left[ - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i^2 - \right| - \right] \\ - &\leq 4 - \E_\varepsilon - \left[ - M - \sup_{t \in \cT} - \left| - \sum_{i=1}^n - \varepsilon_i - t_i - \right| - \right] - = 4 \E_\varepsilon - \left[ - M - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right]. - \end{align*} - % - Taking an expectation over $X_1, \ldots, X_n$ - and applying Fubini's theorem yields the result. -\end{proof} - -\begin{lemma}[A Hoffmann--J{\o}rgensen inequality] - \label{lem:kernel_app_hoffmann} - - Let $(S, \cS)$ be a measurable space - and $X_1, \ldots, X_n$ - be $S$-valued random variables. - Suppose that - $\cF$ is a pointwise measurable class of functions from $S$ to $\R$ - with finite envelope $F$. - Let $\varepsilon_1, \ldots, \varepsilon_n$ - be independent Rademacher variables - independent of $X_1, \ldots, X_n$. - For $q \in (1, \infty)$, - % - \begin{align*} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - ^q - \right] - ^{1/q} - &\leq - C_q - \left( - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right] - + - \E \left[ - \max_{1 \leq i \leq n} - \sup_{f \in \cF} - \big| f(X_i) \big|^q - \right]^{1/q} - \right), - \end{align*} - % - where $C_q$ is a positive constant depending only on $q$. - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_hoffmann}] - - We use Talagrand's formulation of - a Hoffmann--J{\o}rgensen inequality. - Consider the - independent - $\ell^\infty(\cF)$-valued - random functionals $u_i$ defined by - $u_i(f) = \varepsilon_i f(X_i)$, - where $\ell^\infty(\cF)$ - is the Banach space of bounded functions from - $\cF$ to $\R$, - equipped with the norm - $\|u\|_\cF = \sup_{f \in \cF} |u(f)|$. - Then Remark~3.4 in \citet{kwapien1991hypercontraction} gives - % - \begin{align*} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - u_i(f) - \right| - ^q - \right] - ^{1/q} - &\leq - C_q - \left( - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - u_i(f) - \right| - \right] - + - \E \left[ - \max_{1 \leq i \leq n} - \sup_{f \in \cF} - \left| - u_i(f) - \right|^q - \right]^{1/q} - \right) \\ - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - ^q - \right] - ^{1/q} - &\leq - C_q - \left( - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right] - + - \E \left[ - \max_{1 \leq i \leq n} - \sup_{f \in \cF} - \big| f(X_i) \big|^q - \right]^{1/q} - \right). - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_entropy}] - - We follow the proof of Theorem~5.2 - from \citet{chernozhukov2014gaussian}, - using our i.n.i.d.\ versions of the symmetrization inequality - (Lemma~\ref{lem:kernel_app_symmetrization}), - Rademacher contraction principle - (Lemma~\ref{lem:kernel_app_contraction}), - and Hoffmann--J{\o}rgensen inequality - (Lemma~\ref{lem:kernel_app_hoffmann}). - - Without loss of generality, - we may assume that $J(1, \cF, F) < \infty$ - as otherwise there is nothing to prove, - and that $F > 0$ everywhere on $S$. - Let $\P_n = n^{-1} \sum_i \delta_{X_i}$ - be the empirical distribution - of $X_i$, - and define the empirical variance bound - $\sigma_n^2 = \sup_\cF n^{-1} \sum_i f(X_i)^2$. 
- By the i.n.i.d.\ symmetrization inequality - (Lemma~\ref{lem:kernel_app_symmetrization}), - % - \begin{align*} - \E \left[ - \sup_{f \in \cF} - \big| G_n(f) \big| - \right] - &= - \frac{1}{\sqrt n} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \Big( - f(X_i) - - \E[f(X_i)] - \Big) - \right| - \right] - \leq - \frac{2}{\sqrt n} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right], - \end{align*} - % - where $\varepsilon_1, \ldots, \varepsilon_n$ - are independent Rademacher random variables, - independent of $X_1, \ldots, X_n$. - Then the standard entropy integral inequality - from the proof of Theorem~5.2 in - the supplemental materials for - \citet{chernozhukov2014gaussian} - gives for a universal constant $C_1 > 0$, - % - \begin{align*} - \frac{1}{\sqrt n} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \Bigm\vert - X_1, \ldots, X_n - \right] - &\leq - C_1 \|F\|_{\P_n,2} - \, J(\sigma_n / \|F\|_{\P_n,2}, \cF, F). - \end{align*} - % - Taking marginal expectations - and applying Jensen's inequality along with - a convexity result for the covering integral, - as in Lemma~A.2 in \citet{chernozhukov2014gaussian}, gives - % - \begin{align*} - Z - &\vcentcolon= - \frac{1}{\sqrt n} - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right] - \leq - C_1 \|F\|_{\bar\P,2} - \, J(\E[\sigma_n^2]^{1/2} / \|F\|_{\bar\P,2}, \cF, F). - \end{align*} - % - Now use symmetrization - (Lemma~\ref{lem:kernel_app_symmetrization}), - the contraction principle - (Lemma~\ref{lem:kernel_app_contraction}), - the Cauchy--Schwarz inequality, - and the Hoffmann--J{\o}rgensen inequality - (Lemma~\ref{lem:kernel_app_hoffmann}) - to deduce that - % - \begin{align*} - \E[\sigma_n^2] - &= - \E\left[ - \sup_{f \in \cF} - \frac{1}{n} - \sum_{i=1}^n - f(X_i)^2 - \right] - \leq - \sup_{f \in \cF} - \E_{\bar\P} \left[ - f(X_i)^2 - \right] - + \frac{1}{n} - \E\left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - f(X_i)^2 - - \E \left[ - f(X_i)^2 - \right] - \right| - \right] \\ - &\leq - \sigma^2 - + \frac{2}{n} - \E\left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i)^2 - \right| - \right] - \leq - \sigma^2 - + \frac{8}{n} - \E\left[ - M - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right] \\ - &\leq - \sigma^2 - + \frac{8}{n} - \E\left[ - M^2 - \right]^{1/2} - \E\left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right|^2 - \right]^{1/2} \\ - &\leq - \sigma^2 - + \frac{8}{n} - \|M\|_{\P,2} \, - C_2 - \left( - \E \left[ - \sup_{f \in \cF} - \left| - \sum_{i=1}^n - \varepsilon_i - f(X_i) - \right| - \right] - + - \E \left[ - \max_{1 \leq i \leq n} - \sup_{f \in \cF} - \big| f(X_i) \big|^2 - \right]^{1/2} - \right) \\ - &= - \sigma^2 - + \frac{8C_2}{n} - \|M\|_{\P,2} \, - \left( - \sqrt{n} Z - + - \|M\|_{\P,2} - \right) - \lesssim - \sigma^2 - + - \frac{\|M\|_{\P,2} Z}{\sqrt n} - + - \frac{\|M\|_{\P,2}^2}{n}, - \end{align*} - % - where $\lesssim$ indicates a bound up to a universal constant. 
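  We also use the elementary bound
  $\sqrt{a + b + c} \leq \sqrt{a} + \sqrt{b} + \sqrt{c}$,
  valid for any non-negative reals $a$, $b$, and $c$.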
- Hence taking a square root we see that, - following the notation from the proof of Theorem~5.2 - in the supplemental materials to - \citet{chernozhukov2014gaussian}, - % - \begin{align*} - \sqrt{\E[\sigma_n^2]} - &\lesssim - \sigma - + - \|M\|_{\P,2}^{1/2} Z^{1/2} n^{-1/4} - + - \|M\|_{\P,2} n^{-1/2} - \lesssim - \|F\|_{\bar\P,2} - \left( \Delta \vee \sqrt{DZ} \right), - \end{align*} - % - where - $\Delta^2 = \|F\|_{\bar\P,2}^{-2} - \big(\sigma^2 \vee (\|M\|_{\P,2}^2 / n) \big) \geq \delta^2$ - and - $D = \|M\|_{\P,2} n^{-1/2} \|F\|_{\bar\P,2}^{-2}$. - Thus returning to our bound on $Z$, - we now have - % - \begin{align*} - Z - &\lesssim - \|F\|_{\bar\P,2} - \, J(\Delta \vee \sqrt{DZ}, \cF, F). - \end{align*} - % - The final steps proceed as - in the proof of Theorem~5.2 - from \citet{chernozhukov2014gaussian}, - considering cases separately for - $\Delta \geq \sqrt{DZ}$ - and - $\Delta < \sqrt{DZ}$, - and applying convexity properties of - the entropy integral $J$. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_vc_inid}] - - We assume the VC-type condition - % - $\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) \leq - (C_1/\varepsilon)^{C_2}$ - % - for all $\varepsilon \in (0,1]$, - with constants - $C_1 \geq e$ and $C_2 \geq 1$. - Hence for $\delta \in (0,1]$, - the entropy integral can be bounded as - % - \begin{align*} - J\big(\delta, \cF, F\big) - &= - \int_0^\delta - \sqrt{1 + - \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} - \diff{\varepsilon} - \leq - \int_0^\delta - \sqrt{1 + - C_2 \log (C_1/\varepsilon)} - \diff{\varepsilon} \\ - &\leq - \int_0^\delta - \left( - 1 + - \sqrt{C_2 \log (C_1/\varepsilon)} - \right) - \diff{\varepsilon} - = - \delta - + \sqrt{C_2} - \int_0^\delta - \sqrt{\log (C_1/\varepsilon)} - \diff{\varepsilon} \\ - &\leq - \delta - + \sqrt{\frac{C_2}{\log (C_1/\delta)}} - \int_0^\delta - \log (C_1/\varepsilon) - \diff{\varepsilon} - = - \delta - + \sqrt{\frac{C_2}{\log (C_1/\delta)}} - \big( - \delta - + \delta \log (C_1/\delta) - \big) \\ - &\leq - 3 \delta - \sqrt{C_2 \log (C_1/\delta)}. - \end{align*} - % - The remaining equations now follow - by Lemma~\ref{lem:kernel_app_maximal_entropy}. -\end{proof} - -Before proving Lemma~\ref{lem:kernel_app_kmt_corollary}, -we give a bounded-variation characterization -(Lemma~\ref{lem:kernel_app_bv_characterization}). - -\begin{lemma}[A characterization of bounded-variation functions] - \label{lem:kernel_app_bv_characterization} - - Let $\cV_1$ be - the class of real-valued functions on $[0,1]$ - which are 0 at 1 and have total variation bounded by 1. - Also define the class of - half-interval indicator functions $\cI = \{\I[0,t]: t \in [0,1]\}$. - For any topological vector space $\cX$, - define the symmetric convex hull of a subset $\cY \subseteq \cX$ as - % - \begin{align*} - \symconv \cY - &= - \left\{ - \sum_{i=1}^n - \lambda_i - y_i : - \sum_{i=1}^n - \lambda_i - = 1, \ - \lambda_i - \geq 0, \ - y_i \in \cY \cup -\cY, \ - n \in \N - \right\}. - \end{align*} - % - Denote its closure by $\overline\symconv \ \cY$. - Under the pointwise convergence topology, - $\cV_1 \subseteq \overline\symconv \ \cI$. - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_bv_characterization}] - - Firstly, let $\cD \subseteq \cV_1$ - be the class of real-valued functions - on $[0,1]$ - which are - 0 at 1, - have total variation exactly 1, - and are weakly monotone decreasing. - Therefore, for $g \in \cD$, we have - $\|g\|_\TV = g(0) = 1$. 
- Let $S = \{s_1, s_2, \dots\} \subseteq [0,1]$ - be the countable set of discontinuity points of $g$. - We want to find a sequence of - convex combinations of elements of - $\cI$ which converges pointwise to $g$. - To do this, first define the sequence of meshes - % - \begin{align*} - A_n = - \{s_k : 1 \leq k \leq n\} - \cup - \{k/n : 0 \leq k \leq n\}, - \end{align*} - % - which satisfies - $\bigcup_n A_n = S \cup ([0,1] \cap \Q)$. - Endow $A_n$ with the ordering - induced by the canonical order on $\R$, - giving $A_n = \{a_1, a_2, \ldots\}$, - and define the sequence of functions - % - \begin{align*} - g_n(x) - = \sum_{k = 1}^{|A_n|-1} - \I[0,a_k] - \big( g(a_k) - g(a_{k+1}) \big), - \end{align*} - % - where clearly - $\I[0, a_k] \in \cI$, - $g(a_k) - g(a_{k+1}) \geq 0$, - and - $\sum_{k = 1}^{|A_n|-1} - \big( - g(a_k) - g(a_{k+1}) - \big) - = g(0) - g(1) = 1$. - Therefore $g_n$ is a convex combination of elements of $\cI$. - Further, note that for - $a_k \in A_n$, - % - \begin{align*} - g_n(a_k) - = \sum_{j = k}^{|A_n|-1} - \big( g(a_j) - g(a_{j+1}) \big) - = g(a_k) - g(a_{|A_n|}) - = g(a_k) - g(1) - = g(a_k). - \end{align*} - % - Hence if $x \in S$, then eventually $x \in A_n$ so $g_n(x) \to g(x)$. - Alternatively, if $x \not\in S$, then $g$ is continuous at $x$. - But $g_n \to g$ on the dense set $\bigcup_n A_n$, - so also $g_n(x) \to g(x)$. - Hence $g_n \to g$ - pointwise on $[0,1]$. - - Now take $f \in \cV_1$. - By the Jordan decomposition for - total variation functions - \citep{royden1988real}, - we can write - $f = f^+ - f^-$, - with - $f^+$ and $f^-$ weakly decreasing, - $f^+(1) = f^-(1) = 0$, - and - $\|f^+\|_\TV + \|f^-\|_\TV = \|f\|_\TV$. - Supposing that both - $\|f^+\|_\TV$ and $\|f^-\|_\TV$ - are strictly positive, let - $g_n^+$ approximate - the unit-variation function - $f^+/\|f^+\|_\TV$ - and - $g_n^-$ approximate $f^-/\|f^-\|_\TV$ - as above. - Then since trivially - % - \begin{align*} - f = - \|f^+\|_\TV f^+ / \|f^+\|_\TV - - \|f^-\|_\TV f^- / \|f^-\|_\TV - + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV) \cdot 0, - \end{align*} - % - we have that - the convex combination - % - \begin{align*} - g_n^+ \|f^+\|_\TV - - g_n^- \|f^-\|_\TV - + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV) \cdot 0 - \end{align*} - % - converges pointwise to $f$. - This also holds if either of the total variations - $\|f^\pm\|_\TV$ - are zero, - since then the corresponding sequence $g_n^\pm$ - need not be defined. - Now note that each of - $g_n^+$, $\,-g_n^-$, and $0$ - are in $\symconv \cI$, so - $f \in \overline\symconv \ \cI$ - under pointwise convergence. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_kmt_corollary}] - - We follow the Gaussian approximation method given in - Section~2 of \citet{gine2004kernel}. - The KMT approximation theorem \citep{komlos1975approximation} - asserts the existence - of a probability space - carrying $n$ i.i.d.\ uniform random variables - $\xi_1, \ldots, \xi_n \sim \Unif[0,1]$ - and a standard Brownian motion - $B_n(s): s \in [0,1]$ - such that if - % - \begin{align*} - \alpha_n(s) - &\vcentcolon= - \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \big( - \I\{\xi_i \leq s\} - s - \big), - &\beta_n(s) - &\vcentcolon= - B_n(s) - s B_n(1), - \end{align*} - % - then - for some universal positive constants - $C_1$, $C_2$, $C_3$, - and for all $t > 0$, - % - \begin{align*} - \P\left( - \sup_{s \in [0,1]} - \big| \alpha_n(s) - \beta_n(s) \big| - > \frac{t + C_1\log n}{\sqrt{n}} - \right) - \leq C_2 e^{-C_3 t}. 
- \end{align*} - % - We can - view $\alpha_n$ and $\beta_n$ as random functionals - defined on the class of - half-interval indicator functions - $\cI = \big\{\I[0,s]: s \in [0,1]\big\}$ - in the following way. - % - \begin{align*} - \alpha_n(\I[0,s]) - &= \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \big( \I[0,s](\xi_i) - \E[\I[0,s](\xi_i)]), \\ - \beta_n(\I[0,s]) - &= \int_0^1 \I[0,s](u) \diff{B_n(u)} - - B_n(1) \int_0^1 \I[0,s](u) \diff{u}, - \end{align*} - % - where the integrals are defined as It{\^o} and - Riemann--Stieltjes integrals in - the usual way for stochastic integration against semimartingales - \citep[Chapter~5]{legall2016brownian}. - Now we extend their definitions to the class - $\cV_1$ - of functions on $[0,1]$ - which are 0 at 1 and have total variation bounded by 1. - This is achieved by - noting that by Lemma~\ref{lem:kernel_app_bv_characterization}, - we have - $\cV_1 \subseteq \overline\symconv \ \cI$ - where $\overline{\symconv} \ \cI$ is the - smallest - symmetric convex class containing $\cI$ - which is closed under pointwise convergence. - Thus by the dominated convergence theorem, - every function in $\cV_1$ is approximated in $L^2$ by finite convex - combinations of functions in $\pm\cI$, - and the extension to $g \in \cV_1$ follows - by linearity and $L^2$ convergence of (stochastic) integrals: - % - \begin{align*} - \alpha_n(g) - &= - \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \big( g(\xi_i) - \E[g(\xi_i)]), - &\beta_n(g) - &= \int_0^1 g(s) \diff{B_n(s)} - - B_n(1) \int_0^1 g(s) \diff{s}. - \end{align*} - % - Now we show that the norm induced on - $(\alpha_n - \beta_n)$ - by the function class $\cV_1$ is a.s.\ identical to the - supremum norm. - Writing the sums as integrals and using integration by parts - for finite-variation Lebesgue--Stieltjes and It\^o integrals, - and recalling that $g(1) = \alpha_n(0) = B_n(0) = 0$, - % - \begin{align*} - \sup_{g \in \cV_1} - \big|\alpha_n(g) - \beta_n(g)\big| - &= - \sup_{g \in \cV_1} - \left| - \int_0^1 g(s) \diff{\alpha_n(s)} - - \int_0^1 g(s) \diff{B_n(s)} - + B_n(1) \int_0^1 g(s) \diff{s} - \right| \\ - &= - \sup_{g \in \cV_1} - \left| - \int_0^1 \alpha_n(s) \diff{g(s)} - - \int_0^1 B_n(s) \diff{g(s)} - + B_n(1) \int_0^1 s \diff{g(s)} - \right| \\ - &= - \sup_{g \in \cV_1} - \left| - \int_0^1 \big(\alpha_n(s) - \beta_n(s)\big) - \diff{g(s)} - \right| - = \sup_{s \in [0,1]} - \big| - \alpha_n(s) - \beta_n(s) - \big|, - \end{align*} - % - where in the last line - the upper bound is because $\|g\|_\TV \leq 1$, - and the lower bound is by taking - $g_\varepsilon = \pm \I[0,s_\varepsilon]$ where - $|\alpha_n(s_\varepsilon) - \beta_n(s_\varepsilon)| - \geq \sup_s |\alpha_n(s) - \beta_n(s)| - - \varepsilon$. - Hence we obtain - % - \begin{align} - \label{eq:kernel_app_kmt_concentration} - \P\left( - \sup_{g \in \cV_1} - \big|\alpha_n(g) - \beta_n(g)\big| - > \frac{t + C_1\log n}{\sqrt{n}} - \right) - \leq C_2 e^{-C_3 t}. - \end{align} - % - Now define $V_n = \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV$, - noting that if $V_n = 0$ then the result is trivially true - by setting $Z_n = 0$. - Let $F_X$ be the common c.d.f.\ of $X_i$, - and define the quantile function - $F_X^{-1}(s) = \inf \{u: F_X(u) \geq s\}$ for $s \in [0,1]$, - writing $\inf \emptyset = \infty$ - and $\inf \R = -\infty$. 
- Consider the function class - % - \begin{align*} - \cG_n = \big\{ - V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) - : x \in \R \big\}, - \end{align*} - % - noting that $g_n(\cdot,x)$ - is finite-variation so - $g_n(\pm \infty, x)$ - can be interpreted as - the relevant limit. - By monotonicity of $F_X$ and the definition of $V_n$, - the members of $\cG_n$ have total variation of at most $1$ - and are 0 at 1, implying that - $\cG_n \subseteq \cV_1$. - Noting that $\alpha_n$ and $\beta_n$ are random - linear operators which a.s.\ annihilate - constant functions, - define - % - \begin{align*} - Z_n(x) - &= - \beta_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) - = V_n \beta_n \Big( - V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) - \Big), - \end{align*} - % - which is a mean-zero continuous Gaussian process. - Its covariance structure is - % - \begin{align*} - &\E[Z_n(x) Z_n(x')] \\ - &= - \E\bigg[ - \left( - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} - - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} - \right) \\ - &\quad\times - \left( - \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} - - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} - \right) - \bigg] \\ - &= - \E\left[ - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} - \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} - \right] \\ - &\quad- \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \ - \E\left[ - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} - \right] \\ - &\quad- - \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ - \E\left[ - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} - \right] \\ - &\quad+ - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} - \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ - \E\left[ - B_n(1)^2 - \right] \\ - &= - \int_0^1 g_n\big(F_X^{-1}(s),x\big) - g_n\big(F_X^{-1}(s),x'\big) \diff{s} - - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} - \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \\ - &= - \E\Big[ - g_n\big(F_X^{-1}(\xi_i), x\big) - g_n\big(F_X^{-1}(\xi_i), x'\big) - \Big] - - \E\Big[ - g_n\big(F_X^{-1}(\xi_i), x\big) - \Big] - \E\Big[ - g_n\big(F_X^{-1}(\xi_i), x'\big) - \Big] \\ - &= - \E\Big[ - g_n\big(X_i, x\big) - g_n\big(X_i, x'\big) - \Big] - - \E\Big[ - g_n\big(X_i, x\big) - \Big] - \E\Big[ - g_n\big(X_i, x'\big) - \Big] - = - \E\big[ - G_n(x) - G_n(x') - \big] - \end{align*} - % - as desired, by the It\^o isometry for stochastic integrals, - writing $B_n(1) = \int_0^1 \diff{B_n(s)}$; - and noting that $F_X^{-1}(\xi_i)$ - has the same distribution as $X_i$. - Finally, note that - % - \begin{align*} - G_n(x) - &= - \alpha_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) - = V_n \alpha_n \Big( - V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) - \Big), - \end{align*} - % - and so by \eqref{eq:kernel_app_kmt_concentration} - % - \begin{align*} - \P\left( - \sup_{x \in \R} - \Big|G_n(x) - Z_n(x)\Big| - > V_n \frac{t + C_1 \log n}{\sqrt n} - \right) - &\leq - \P\left( - \sup_{g \in \cV_1} - \big|\alpha_n(g) - \beta_n(g)\big| - > \frac{t + C_1\log n}{\sqrt{n}} - \right) \\ - &\leq C_2 e^{-C_3 t}. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_yurinskii_corollary}] - - Take $0 < \delta_n \leq \Leb(\cX_n)$ and let - $\cX_n^\delta = \big\{ x_1, \dots, x_{|\cX_n^\delta|}\big\}$ - be a $\delta_n$-covering of $\cX_n$ with cardinality - $|\cX_n^\delta| \leq \Leb(\cX_n)/\delta_n$. 
- Suppose that $\left|\log \delta_n\right| \lesssim C_1 \log n$ - up to a universal constant. - We first use the Yurinskii coupling to - construct a Gaussian process - $Z_n$ - which is close to $G_n$ - on this finite cover. - Then we bound the fluctuations in $G_n$ - and in $Z_n$ - using entropy methods. - - \proofparagraph{Yurinskii coupling} - - Define the i.n.i.d.\ - and mean-zero variables - % - \begin{align*} - h_i(x) - &= - \frac{1}{\sqrt n} - \Big( - g_n(X_i', x) - - \E[g_n(X_i', x)] - \Big), - \end{align*} - % - where $X_1', \ldots, X_n'$ - are independent copies of $X_1, \ldots, X_n$ - on some new probability space, - so that we have - $G_n(x) = \sum_{i=1}^n h_i(x)$ - in distribution. - Also define the length-$|\cX_n^\delta|$ random vector - % - \begin{align*} - h_i^\delta - &= - \big( - h_i(x): x \in \cX_n^\delta - \big). - \end{align*} - % - By an extension of - Yurinskii's coupling - to general norms - \citep[supplemental materials, Lemma~38]{belloni2019conditional}, - there exists on the new probability space a - Gaussian length-$|\cX_n^\delta|$ vector $Z_n^\delta$ - which is mean-zero - and with the same covariance structure as - $ - \sum_{i=1}^n - h_i^\delta - $ - satisfying - % - \begin{align*} - \P\left( - \bigg\| - \sum_{i=1}^n - h_i^\delta - - Z_n^\delta - \bigg\|_\infty - > 3 t_n - \right) - \leq - \min_{s > 0} - \left( - 2 \P\big( \|N\|_\infty > s) - + \frac{\beta s^2}{t_n^3} - \right), - \end{align*} - % - where - % - \begin{align*} - \beta - = \sum_{i=1}^n - \Big( - \E\big[\|h_i^\delta\|_2^2 \, - \|h_i^\delta\|_\infty - \big] - + \E\big[\|z_i\|_2^2 \, - \|z_i\|_\infty - \big] - \Big), - \end{align*} - % - with $z_i \sim \cN(0, \Var[h_i^\delta])$ - independent and $N \sim \cN(0, I_{|\cX_n^\delta|})$. - By the bounds on $g_n$, - % - \begin{align*} - \E\big[\|h_i^\delta\|_2^2 \, - \|h_i^\delta\|_\infty \, - \big] - \leq - \frac{M_n}{\sqrt n} - \E\big[\|h_i^\delta\|_2^2 \, - \big] - = - \frac{M_n}{\sqrt n} - \sum_{x \in \cX_n^\delta} - \E\big[h_i(x)^2 \, - \big] - \leq - \frac{M_n}{\sqrt n} - \frac{|\cX_n^\delta| \sigma_n^2}{n} - \leq - \frac{M_n \sigma_n^2 \Leb(\cX_n)}{n^{3/2}\delta_n}. - \end{align*} - % - By the fourth moment bound for Gaussian variables, - % - \begin{align*} - \E\big[ - \|z_i\|_2^4 \, - \big] - &\leq - |\cX_n^\delta| \, - \E\big[ - \|z_i\|_4^4 - \big] - \leq - |\cX_n^\delta|^2 \, - \max_j - \E\big[ - (z_i^{(j)})^4 - \big] - \leq - 3 - |\cX_n^\delta|^2 \, - \max_j - \E\big[ - (z_i^{(j)})^2 - \big]^2 \\ - &= - 3 - |\cX_n^\delta|^2 \, - \max_{x \in \cX_n^\delta} - \E\big[ - h_i(x)^2 - \big]^2 - \leq - \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2} . - \end{align*} - % - Also by Jensen's inequality - and for $|\cX_n^\delta| \geq 2$, - assuming $C_1 > 1$ without loss of generality, - % - \begin{align*} - \E\big[ - \|z_i\|_\infty^2 - \big] - &\leq - \frac{4 \sigma_n^2}{n} - \log - \E\big[ - e^{\|z_i\|_\infty^2 / (4\sigma_n^2/n)} - \big] - \leq - \frac{4 \sigma_n^2}{n} - \log - \E\left[ - \sum_{j=1}^{|\cX_n^\delta|} - e^{(z_i^{(j)})^2 / (4\sigma_n^2/n)} - \right] - \leq - \frac{4\sigma_n^2}{n} - \log \big(2|\cX_n^\delta|\big) \\ - &\leq - \frac{4\sigma_n^2}{n} - \left( - \log 2 + \log \Leb(\cX_n) - \log \delta_n - \right) - \leq - \frac{12 C_1 \sigma_n^2 \log n}{n}, - \end{align*} - % - where we used the moment - generating function of a $\chi_1^2$ random variable. 
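For completeness, the two Gaussian facts used above are elementary:
if $N \sim \cN(0, v)$ then $\E[N^4] = 3 v^2$,
and $\E\big[e^{\lambda N^2}\big] = (1 - 2 \lambda v)^{-1/2}$
whenever $\lambda v < 1/2$.
In particular, since $\Var\big[z_i^{(j)}\big] \leq \sigma_n^2/n$,
%
\begin{align*}
\E\Big[
e^{(z_i^{(j)})^2 / (4\sigma_n^2/n)}
\Big]
&=
\Big(
1 - 2 \Var\big[z_i^{(j)}\big] \big/ (4\sigma_n^2/n)
\Big)^{-1/2}
\leq
(1 - 1/2)^{-1/2}
= \sqrt{2}
\leq 2,
\end{align*}
%
which is the bound applied to each coordinate
before summing over $j$ above.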
- Therefore we can apply the Cauchy--Schwarz inequality - to obtain - % - \begin{align*} - \E\big[\|z_i\|_2^2 \, - \|z_i\|_\infty - \big] - &\leq - \sqrt{ - \E\big[\|z_i\|_2^4 - \big]} - \sqrt{ - \E\big[ - \|z_i\|_\infty^2 - \big]} - \leq - \sqrt{ - \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2}} - \sqrt{ \frac{12 C_1 \sigma_n^2 \log n}{n} } \\ - &\leq - \frac{6\sigma_n^3 - \Leb(\cX_n) - \sqrt{C_1 \log n}}{n^{3/2} \delta_n}. - \end{align*} - % - Now summing over the $n$ samples gives - % - \begin{align*} - \beta - \leq - \frac{M_n \sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} - + \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}} - {\sqrt n \delta_n} - = - \frac{\sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} - \Big(M_n + 6\sigma_n \sqrt{C_1 \log n}\Big). - \end{align*} - % - By a union bound - and Gaussian tail probabilities, - we have that - $\P\big( \|N\|_\infty > s) - \leq 2|\cX_n^\delta| e^{-s^2/2}$. - Thus we get the following Yurinskii coupling inequality - for all $s > 0$: - % - \begin{align*} - \P\left( - \bigg\| - \sum_{i=1}^n - h_i^\delta - - Z_n^\delta - \bigg\|_\infty - > t_n - \right) - &\leq - \frac{4 \Leb(\cX_n)}{\delta_n} - e^{-s^2/2} - + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} - \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). - \end{align*} - % - Note that - $Z_n^\delta$ - now extends - by the Vorob'ev--Berkes--Philipp theorem - (Lemma~\ref{lem:kernel_app_vbp}) - to a mean-zero Gaussian - process - $Z_n$ on the compact interval $\cX_n$ - with covariance structure - % - \begin{align*} - \E\big[ - Z_n(x) - Z_n(x') - \big] - = - \E\big[ - G_n(x) - G_n(x') - \big], - \end{align*} - % - satisfying for any $s' > 0$ - % - \begin{align*} - &\P\left( - \sup_{x \in \cX_n^\delta} - \big| - G_n(x) - Z_n(x) - \big| - > t_n - \right) - \leq - \frac{4 \Leb(\cX_n)}{\delta_n} - e^{-s^2/2} - + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} - \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). - \end{align*} - - \proofparagraph{regularity of $G_n$} - - Next we bound the fluctuations in - the empirical process $G_n$. - Consider the following classes of functions on $S$ - and their associated (constant) envelope functions. - By continuity of $g_n$, - each class is pointwise measurable - (to see this, restrict the index sets to rationals). - % - \begin{align*} - \cG_n - &= - \big\{ - g_n(\cdot, x): - x \in \cX_n - \big\}, - &\Env(\cG_n) - &= - M_n, \\ - \cG_n^\delta - &= - \big\{ - g_n(\cdot, x) - - g_n(\cdot, x'): - x, x' \in \cX_n, - |x-x'| \leq \delta_n - \big\}, - &\Env(\cG_n^\delta) - &= - l_{n,\infty} \delta_n. - \end{align*} - % - We first show these are VC-type. - By the uniform Lipschitz assumption, - % - \begin{align*} - \big\| - g_n(\cdot, x) - - g_n(\cdot, x') - \big\|_\infty - &\leq l_{n,\infty} |x-x'| - \end{align*} - % - for all $x,x' \in \cX_n$. - Therefore, with $\Q$ ranging over the - finitely-supported distributions - on $(S, \cS)$, - noting that any $\|\cdot\|_\infty$-cover - is a $\rho_\Q$-cover, - % - \begin{align*} - \sup_\Q - N\big(\cG_n, \rho_\Q, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) - &\leq - N\big(\cG_n, \|\cdot\|_\infty, - \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) - \leq - N\big(\cX_n, |\cdot|, \varepsilon \!\Leb(\cX_n)\big) - \leq - 1/\varepsilon. - \end{align*} - % - Replacing $\varepsilon$ by - $\varepsilon M_n/(l_{n,\infty} \Leb(\cX_n))$ - gives - % - \begin{align*} - \sup_\Q - N\big(\cG_n, \rho_\Q, \varepsilon M_n \big) - &\leq - \frac{l_{n,\infty} \Leb(\cX_n)}{\varepsilon M_n}, - \end{align*} - % - and so $\cG_n$ - is a VC-type class. 
- To see that $\cG_n^\delta$ - is also a VC-type class, - we construct a cover in the following way. - Let $\cF_n$ be an $\varepsilon$-cover - for $(\cG_n, \|\cdot\|_\infty)$. - By the triangle inequality, - $\cF_n - \cF_n$ is a $2\varepsilon$-cover - for $(\cG_n - \cG_n, \|\cdot\|_\infty)$ - of cardinality at most $|\cF_n|^2$, - where the subtractions are set subtractions. - Since $\cG_n^\delta \subseteq \cG_n - \cG_n$, - we see that $\cF_n - \cF_n$ is a $2\varepsilon$-external cover - for $\cG_n^\delta$. Thus - % - \begin{align*} - \sup_\Q - N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) - &\leq - N\big(\cG_n^\delta, \|\cdot\|_\infty, - \varepsilon l_{n,\infty} \Leb(\cX_n)\big) \\ - &\leq - N\big(\cG_n, \|\cdot\|_\infty, - \varepsilon l_{n,\infty} \Leb(\cX_n)\big)^2 - \leq - 1/\varepsilon^2. - \end{align*} - % - Replacing $\varepsilon$ by - $\varepsilon \delta_n/\Leb(\cX_n)$ - gives - % - \begin{align*} - \sup_\Q - N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \delta_n \big) - &\leq - \frac{\Leb(\cX_n)^2}{\varepsilon^2 \delta_n^2} - \leq - (C_{1,n}/\varepsilon)^{2} - \end{align*} - % - with $C_{1,n} = \Leb(\cX_n) / \delta_n$, - demonstrating that $\cG_n^\delta$ - forms a VC-type class. - We now apply the maximal inequality - for i.n.i.d.\ data - given in - Lemma~\ref{lem:kernel_app_maximal_vc_inid}. - To do this, - note that - $\sup_{\cG_n^\delta} \|g\|_{\bar\P,2} - \leq l_{n,2} \delta_n$ - by the $L^2$ Lipschitz condition, and recall - $\Env(\cG_n^\delta) = l_{n,\infty} \delta_n$. - Therefore Lemma~\ref{lem:kernel_app_maximal_vc_inid} with - $\|F\|_{\bar\P,2} = l_{n,\infty} \delta_n$, - $\|M\|_{\P,2} = l_{n,\infty} \delta_n$, - and $\sigma = l_{n,2} \delta_n$ - gives, - up to universal constants - % - \begin{align*} - &\E\left[ - \sup_{g \in \cG_n^\delta} - \left| - \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \Big( - g(X_i) - - \E[g(X_i)] - \Big) - \right| - \right] \\ - &\quad\lesssim - \sigma - \sqrt{2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} - + - \frac{\|M\|_{\P,2} 2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} - {\sqrt{n}} \\ - &\quad\lesssim - l_{n,2} \delta_n - \sqrt{C_1 \log n} - + - \frac{l_{n,\infty} \delta_n}{\sqrt n} - C_1 \log n, - \end{align*} - % - and hence by Markov's inequality, - % - \begin{align*} - &\P\left( - \sup_{|x-x'| \leq \delta_n} - \big| - G_n(x) - G_n(x') - \big| - > t_n - \right) \\ - &= - \P\left( - \sup_{|x-x'| \leq \delta_n} - \frac{1}{\sqrt{n}} - \left| - \sum_{i=1}^n - \Big( - g_n(X_i, x) - \E[g_n(X_i, x)] - - g_n(X_i, x') + \E[g_n(X_i, x')] - \Big) - \right| - > t_n - \right) \\ - &= - \P\left( - \sup_{g \in \cG_n^\delta} - \left| - \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \Big( - g(X_i) - \E[g(X_i)] - \Big) - \right| - > t_n - \right) - \leq - \frac{1}{t} - \E\left[ - \sup_{g \in \cG_n^\delta} - \left| - \frac{1}{\sqrt{n}} - \sum_{i=1}^n - \Big( - g(X_i) - \E[g(X_i)] - \Big) - \right| - \right] \\ - &\lesssim - \frac{l_{n,2} \delta_n}{t_n} - \sqrt{C_1 \log n} - + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. - \end{align*} - - \proofparagraph{regularity of $Z_n$} - - Next we bound the fluctuations in the Gaussian process - $Z_n$. - Let $\rho$ be the following semimetric: - % - \begin{align*} - \rho(x, x')^2 - &= - \E\big[\big( Z_n(x) - Z_n(x') \big)^2\big] - = - \E\big[\big( G_n(x) - G_n(x') \big)^2\big] \\ - &= - \frac{1}{n} - \sum_{i=1}^n - \E\big[\big( h_i(x) - h_i(x') \big)^2\big] - \leq - l_{n,2}^2 \, |x - x'|^2. - \end{align*} - % - Hence - $\rho(x, x') - \leq - l_{n,2} \, |x - x'|$. 
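In particular, any $|\cdot|$-ball of radius $\varepsilon / l_{n,2}$
is contained in a $\rho$-ball of radius $\varepsilon$,
so the covering numbers compare as
%
\begin{align*}
N(\varepsilon, \cX_n, \rho)
&\leq
N(\varepsilon / l_{n,2}, \cX_n, |\cdot|),
\end{align*}
%
a comparison used in the entropy integral below.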
- By - the Gaussian process maximal inequality from - Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - we obtain that - % - \begin{align*} - &\E\bigg[ - \sup_{|x - x'| \leq \delta_n} - \big| - Z_n(x) - Z_n(x') - \big| - \bigg] - \lesssim - \E\bigg[ - \sup_{\rho(x,x') \leq l_{n,2} \delta_n} - \big| - Z_n(x) - Z_n(x') - \big| - \bigg] \\ - &\quad\leq - \int_0^{l_{n,2} \delta_n} - \sqrt{\log N(\varepsilon, \cX_n, \rho)} - \diff{\varepsilon} - \leq - \int_0^{l_{n,2} \delta_n} - \sqrt{\log N(\varepsilon / l_{n,2}, \cX_n, |\cdot|)} - \diff{\varepsilon} \\ - &\quad\leq - \int_0^{l_{n,2} \delta_n} - \sqrt{\log \left( 1 + \frac{\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} - \diff{\varepsilon} - \leq - \int_0^{l_{n,2} \delta_n} - \sqrt{\log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} - \diff{\varepsilon} \\ - &\quad\leq - \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} - \int_0^{l_{n,2} \delta_n} - \log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right) - \diff{\varepsilon} \\ - &\quad= - \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} - \left( - l_{n,2} \delta_n \log \left( 2 \Leb(\cX_n) l_{n,2} \right) - + l_{n,2} \delta_n - + l_{n,2} \delta_n \log \left( \frac{1}{l_{n,2} \delta_n} \right) - \right) \\ - &\quad= - \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} - l_{n,2} \delta_n - \left( - 1 + - \log \left( \frac{2\Leb(\cX_n)}{\delta_n} \right) - \right) - \lesssim - l_{n,2} \delta_n - \sqrt{\log \left( \frac{\Leb(\cX_n)}{\delta_n} \right)} \\ - &\quad\lesssim - l_{n,2} \delta_n - \sqrt{C_1 \log n}, - \end{align*} - % - where we used that $\delta_n \leq \Leb(\cX_n)$. - So by Markov's inequality, - % - \begin{align*} - \P\left( - \sup_{|x - x'| \leq \delta_n} - \big| - Z_n(x) - Z_n(x') - \big| - > t_n - \right) - &\lesssim - t_n^{-1} - l_{n,2} \delta_n - \sqrt{C_1 \log n}. - \end{align*} - - \proofparagraph{conclusion} - - By the results of the previous parts, - we have up to universal constants that - % - \begin{align*} - &\P\left( - \sup_{x \in \cX_n} - \big| - G_n(x) - Z_n(x) - \big| - > t_n - \right) \\ - &\quad\leq - \P\left( - \sup_{x \in \cX_n^\delta} - \big| - G_n(x) - Z_n(x) - \big| - > t_n / 3 - \right) - + \P\left( - \sup_{|x-x'| \leq \delta_n} - \big| - G_n(x) - G_n(x') - \big| - > t_n / 3 - \right) \\ - &\qquad+ - \P\left( - \sup_{|x - x'| \leq \delta_n} - \big| - Z_n(x) - Z_n(x') - \big| - > t_n / 3 - \right) \\ - &\quad\lesssim - \frac{4 \Leb(\cX_n)}{\delta_n} - e^{-s^2/2} - + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} - \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ - &\qquad+ - \frac{l_{n,2} \delta_n}{t_n} - \sqrt{C_1 \log n} - + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. 
\end{align*}
%
Choosing an approximately optimal mesh size of
%
\begin{align*}
\delta_n
&=
\sqrt{
\frac{\sigma_n^2 \Leb(\cX_n) \log n}{\sqrt n t_n^3}
\Big(M_n + \sigma_n \sqrt{\log n}\Big)
} \Bigg/
\sqrt{
t_n^{-1}
l_{n,2}
\sqrt{\log n}
\left(
1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt{n}}
\right)
}
\end{align*}
%
gives $\left|\log \delta_n\right| \lesssim C_1 \log n$ up to a universal constant,
so with $s$ a large enough multiple of $\sqrt{\log n}$,
%
\begin{align*}
&\P\left(
\sup_{x \in \cX_n}
\big|
G_n(x) - Z_n(x)
\big|
> t_n
\right) \\
&\quad\lesssim
\frac{4 \Leb(\cX_n)}{\delta_n}
e^{-s^2/2}
+ \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3}
\Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\
&\qquad+
\frac{l_{n,2} \delta_n}{t_n}
\sqrt{C_1 \log n}
+ \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n \\
&\quad\lesssim
\delta_n
\frac{l_{n,2} \sqrt {\log n}}{t_n}
\left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt n} \right) \\
&\quad\lesssim
\frac{\sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n}
\sqrt{M_n + \sigma_n \sqrt{\log n}}}
{n^{1/4} t_n^2}
\sqrt{l_{n,2} \sqrt {\log n}
+ \frac{l_{n,\infty}}{\sqrt n} \log n}.
\end{align*}
%
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_vbp}]

The proof is by induction on the number of vertices in the tree.
Let $\cT$ have $n$ vertices,
and suppose that vertex $n$ is a leaf
connected to vertex $n-1$ by an edge,
relabeling the vertices if necessary.
By the induction hypothesis we assume that there is a
probability measure $\P^{(n-1)}$
on $\prod_{i=1}^{n-1} \cX_i$
whose projections onto $\cX_i$ are $\P_i$
and whose projections onto $\cX_i \times \cX_j$ are $\P_{i j}$,
for $i,j \leq n-1$.
Now apply the original
Vorob'ev--Berkes--Philipp theorem,
which can be found as Theorem~1.1.10 in
\citet{dudley1999uniform},
to the spaces
$\prod_{i=1}^{n-2} \cX_i$,\,
$\cX_{n-1}$, and
$\cX_n$;
and to the laws
$\P^{(n-1)}$
and
$\P_{n-1, n}$.
This gives a law $\P^{(n)}$
which agrees with $\P_i$
at every vertex by definition,
and agrees with
$\P_{i j}$ for all $i,j \leq n-1$.
It also agrees with $\P_{n-1,n}$,
and this is the only edge touching vertex $n$.
Hence $\P^{(n)}$ satisfies the desired properties.
\end{proof}

\subsection{Main results}
\label{sec:kernel_app_main}

We give supplementary details for our main results on consistency, minimax
optimality, strong approximation, covariance estimation, feasible inference and
counterfactual estimation.
We begin with a basic fact about Lipschitz functions.

\begin{lemma}[Lipschitz kernels are bounded]
\label{lem:kernel_app_lipschitz_kernels_bounded}

Let $\cX \subseteq \R$ be a connected set.
Let $f: \cX \to \R$ satisfy the Lipschitz condition
$|f(x) - f(x')| \leq C |x-x'|$ for some $C > 0$
and all $x, x' \in \cX$.
Suppose also that $f$ is a kernel in the sense that
$\int_\cX f(x) \diff{x} = 1$.
Then we have
%
\begin{align*}
\sup_{x \in \cX} |f(x)|
&\leq
C \Leb(\cX) + \frac{1}{\Leb(\cX)}.
\end{align*}
%
Now let $g: \cX \to [0,\infty)$ satisfy
$|g(x) - g(x')| \leq C |x-x'|$ for some $C > 0$
and all $x, x' \in \cX$.
Suppose $g$ is a sub-kernel with
$\int_\cX g(x) \diff{x} \leq 1$.
Then for any $M \in \big(0, \Leb(\cX)\big]$,
we have
%
\begin{align*}
\sup_{x \in \cX} g(x)
&\leq
C M + \frac{1}{M}.
- \end{align*} - -\end{lemma} - -Applying Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} -to the density and kernel functions defined in -Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} -yields the following. -Firstly, since $k_h(\cdot, w)$ is $C_\rL / h^2$-Lipschitz -on $[w \pm h] \cap \cW$ and integrates to one, -we have by the first inequality in -Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} that -% -\begin{align*} - |k_h(s,w)| - &\leq \frac{2 C_\rL + 1}{h} + \frac{1}{\Leb(\cW)}. -\end{align*} -% -Since each of -$f_{W \mid AA}(\cdot \mid a,a')$, -$f_{W \mid A}(\cdot \mid a)$, and -$f_W$ is non-negative, and $C_\rH$-Lipschitz on $\cW$ -and integrates to at most one over $\cW$, -taking $M = \frac{1}{\sqrt{C_\rH}} \wedge \Leb(\cW)$ -in the second inequality in -Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} -gives -% -\begin{align*} - f_{W \mid AA}(w \mid a,a') - &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ - f_{W \mid A}(w \mid a) - &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ - f_W(w) - &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}. -\end{align*} - -\begin{proof}[Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}] - - We begin with the first inequality. - Note that if $\Leb(\cX) = \infty$ there is nothing to prove. - Suppose for contradiction that - $|f(x)| > C \Leb(\cX) + \frac{1}{\Leb(\cX)}$ - for some $x \in \cX$. - If $f(x) \geq 0$ - then by the Lipschitz property, for any $y \in \cX$, - % - \begin{align*} - f(y) - \geq f(x) - C|y-x| - > C \Leb(\cX) + \frac{1}{\Leb(\cX)} - C\Leb(\cX) - = \frac{1}{\Leb(\cX)}. - \end{align*} - % - Similarly, if $f(x) \leq 0$ then - % - \begin{align*} - f(y) - \leq f(x) + C|y-x| - < - C \Leb(\cX) - \frac{1}{\Leb(\cX)} + C\Leb(\cX) - = -\frac{1}{\Leb(\cX)}. - \end{align*} - % - But then either - $\int_\cX f(x) \diff{x} > \int_\cX 1/\Leb(\cX) \diff{x} = 1$ - or - $\int_\cX f(x) \diff{x} < \int_\cX -1/\Leb(\cX) \diff{x} = -1 < 1$, - giving a contradiction. - - For the second inequality, - assume that $f$ is non-negative on $\cX$, - and take $M \in \big(0, \Leb(\cX)\big]$. - Suppose for contradiction that - $f(x) > C M + \frac{1}{M}$ - for some $x \in \cX$. - Then by the Lipschitz property, $f(y) \geq 1/M$ - for all $y$ such that $|y - x| \leq M$. - Since $\cX$ is connected, we have - $\Leb(\cX \cap [x \pm M]) \geq M$ - and so we deduce that - $\int_\cX f(x) \diff{x} > M/M = 1$ - which is a contradiction. -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_bias}] - - Begin by defining - % - \begin{align*} - P_p(s,w) - &= - \sum_{r = 0}^p - \frac{f_W^{(r)}(w)}{r!} - {(s-w)^r} - \end{align*} - % - for $s, w \in \cW$ - as the degree-$p$ Taylor polynomial of $f_W$, - centered at $w$ and evaluated at $s$. - Note that - for $p \leq \flbeta-1$, - by Taylor's theorem with Lagrange remainder, - % - \begin{align*} - f_W(s) - P_p(s,w) - &= - \frac{f_W^{(p+1)}(w')}{(p+1)!} - (s-w)^{p+1} - \end{align*} - % - for some $w'$ between $w$ and $s$. - Also note that for any $p$, - % - \begin{align*} - \int_{\cW} - k_h(s,w) - \big( - P_p(s,w) - - P_{p-1}(s,w) - \big) - \diff{s} - &= - \int_{\cW} - k_h(s,w) - \frac{f_W^{(p)}(w)}{p!} - (s-w)^p - \diff{s} - = h^p b_p(w). - \end{align*} - % - Further, by the order of the kernel, - % - \begin{align*} - \E\big[\hat f_W(w)\big] - - f_W(w) - &= - \int_{\cW} - k_h(s,w) - f_W(s) - \diff{s} - - f_W(w) - = - \int_{\cW} - k_h(s,w) - \big(f_W(s) - f_W(w)\big) - \diff{s} \\ - &= - \int_{\cW} - k_h(s,w) - \big(f_W(s) - P_{p-1}(s,w)\big) - \diff{s}. 
- \end{align*} - - \proofparagraph{low-order kernel} - Suppose that $p \leq \flbeta - 1$. Then - % - \begin{align*} - &\sup_{w \in \cW} - \big| - \E[\hat f_W(w)] - - f_W(w) - - h^p b_p(w) - \big| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big(f_W(s) - P_{p-1}(s,w)\big) - \diff{s} - - h^p b_p(w) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big( - f_W(s) - P_{p}(s,w) - + P_{p}(s,w) - P_{p-1}(s,w) - \big) - \diff{s} - - h^p b_p(w) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big( - f_W(s) - P_{p}(s,w) - \big) - \diff{s} - \right| - = \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \frac{f_W^{(p+1)}(w')}{(p+1)!} - (s-w)^{p+1} - \diff{s} - \right| \\ - &\quad\leq - \sup_{w \in \cW} - \left| - \int_{[w \pm h]} - \frac{C_\rk}{h} - \frac{C_\rH}{(p+1)!} - h^{p+1} - \diff{s} - \right| - \leq - \frac{2C_\rk C_\rH}{(p+1)!} - h^{p+1}. - \end{align*} - - \proofparagraph{order of kernel matches smoothness} - Suppose that $p = \flbeta$. - Then - % - \begin{align*} - &\sup_{w \in \cW} - \big| - \E[\hat f_W(w)] - - f_W(w) - - h^p b_p(w) - \big| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big(f_W(s) - P_{\flbeta - 1}(s,w)\big) - \diff{s} - - h^p b_p(w) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big( - f_W(s) - P_{\flbeta}(s,w) - + P_{\flbeta}(s,w) - P_{\flbeta - 1}(s,w) - \big) - \diff{s} - - h^{\flbeta} b_{\flbeta}(w) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \big( - f_W(s) - P_{\flbeta}(s,w) - \big) - \diff{s} - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - \frac{f_W^{(\flbeta)}(w') - f_W^{(\flbeta)}(w)}{\flbeta!} - (s-w)^{\flbeta} - \diff{s} - \right| \\ - &\quad\leq - \sup_{w \in \cW} - \left| - \int_{[w \pm h]} - \frac{C_\rk}{h} - \frac{C_\rH h^{\beta - \flbeta}}{\flbeta !} - h^{\flbeta} - \diff{s} - \right| - \leq - \frac{2 C_\rk C_\rH}{\flbeta !} - h^\beta. - \end{align*} - - \proofparagraph{high-order kernel} - Suppose that $p \geq \flbeta+1$. - Then as in the previous part - % - \begin{align*} - \sup_{w \in \cW} - \big| - \E[\hat f_W(w)] - - f_W(w) - \big| - &= - \sup_{w \in \cW} - \left| - \int_{[w \pm h] \cap \cW} - \!\!\!\! k_h(s,w) - \big( - f_W(s) - P_{\flbeta}(s,w) - \big) - \diff{s} - \right| - \leq - \frac{2 C_\rk C_\rH}{\flbeta !} - h^\beta. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_hoeffding}] - - \proofparagraph{Hoeffding-type decomposition} - - \begin{align*} - \hat f_W(w) - - E_n(w) - - \E[\hat f_W(w)] - &= - \frac{2}{n(n-1)} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - \Big( - \E[k_h(W_{i j},w) \mid A_i, A_j] - - \E[k_h(W_{i j},w)] - \Big) \\ - &= - \frac{1}{n(n-1)} - \sum_{i=1}^{n-1} - \sum_{j \neq i} - \Big( - \E[k_h(W_{i j},w) \mid A_i, A_j] - - \E[k_h(W_{i j},w)] - \Big), - \end{align*} - % - and apply Lemma~\ref{lem:kernel_app_general_hoeffding} with - % - \begin{align*} - u_{i j} - &= - \frac{1}{n(n-1)} - \E\big[k_h(W_{i j},w) \mid A_i, A_j\big], - &u_i - &= - \frac{1}{n(n-1)} - \E\big[k_h(W_{i j},w) \mid A_i\big], \\ - u - &= - \frac{1}{n(n-1)} - \E\big[k_h(W_{i j},w)\big], - \end{align*} - % - to see - % - \begin{align*} - \hat f_W(w) - - E_n(w) - - \E[\hat f_W(w)] - &= - \frac{2}{n} - \sum_{i=1}^n - \big(u_i - u\big) - + \frac{1}{n(n-1)} - \sum_{i=1}^n - \sum_{j \neq i} - \big( - u_{i j} - u_i - u_j + u - \big) \\ - &= - \frac{2}{n} - \sum_{i=1}^n - l_i(w) - + \frac{2}{n(n-1)} - \sum_{i=1}^n - \sum_{j = i+1}^n - q_{i j}(w) - = - L_n + Q_n. 
- \end{align*} - - \proofparagraph{expectation and covariance of $L_n$, $Q_n$, and $E_n$} - - $L_n$, $Q_n$, and $E_n$ - are clearly mean-zero. - For orthogonality, - note that their summands - have the following properties, - for any $1 \leq i < j \leq n$ - and $1 \leq r < s \leq n$, - and for any $w, w' \in \cW$: - % - \begin{align*} - \E\big[ - l_i(w) - q_{rs}(w') - \big] - &= - \E\big[ - l_i(w) - \E\big[ - q_{rs}(w') \mid A_i - \big] - \big] - = 0, \\ - \E\big[ - l_i(w) - e_{rs}(w') - \big] - &= - \begin{cases} - \E\big[ - l_i(w) - \big] - \E\big[ - e_{rs}(w') - \big], - \text{ if } i \notin \{r,s\}, \\ - \E\big[ - l_i(w) - \E\big[ - e_{rs}(w') \mid A_r, A_s - \big] - \big], - \text{ if } i \in \{r,s\}, - \end{cases} \\ - &= - 0, \\ - \E\big[ - q_{i j}(w) - e_{rs}(w') - \big] - &= - \begin{cases} - \E\big[ - q_{i j}(w) - \big] - \E\big[ - e_{rs}(w') - \big], - \text{ if } \{i,j\} \cap \{r,s\} = \emptyset, \\ - \E\big[ - \E\big[ - q_{i j}(w) \mid A_i - \big] - \E\big[ - e_{rs}(w') \mid A_i - \big] - \big], - \text{ if } \{i,j\} \cap \{r,s\} = \{i\}, \\ - \E\big[ - \E\big[ - q_{i j}(w) \mid A_j - \big] - \E\big[ - e_{rs}(w') \mid A_j - \big] - \big], - \text{ if } \{i,j\} \cap \{r,s\} = \{j\}, \\ - \E\big[ - q_{i j}(w) - \E\big[ - e_{rs}(w') \mid A_r, A_s - \big] - \big], - \text{ if } \{i,j\} = \{r,s\}, - \end{cases} \\ - &= - 0, - \end{align*} - % - by independence of $\bA_n$ and $\bV_n$ - and as $\E[q_{rs}(w) \mid A_i] = 0$ - and $\E[e_{i j}(w) \mid A_i, A_j] = 0$. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_trichotomy}] - - \proofparagraph{total degeneracy} - - Suppose - $\Dl = 0$, so - $\Var[f_{W \mid A}(w \mid A_i)] = 0$ - for all $w \in \cW$. - Therefore, for all $w \in \cW$, - we have $f_{W \mid A}(w) = f_W(w)$ almost surely. - By taking a union over $\cW \cap \Q$ - and by continuity of $f_{W \mid A}$ and $f_W$, - this implies that $f_{W \mid A}(w) = f_W(w)$ - for all $w \in \cW$ - almost surely. Thus - % - \begin{align*} - \E\left[ - k_h(W_{i j},w) \mid A_i - \right] - &= - \int_{\cW} - k_h(s,w) - f_{W \mid A}(s \mid A_i) - \diff{s} - = - \int_{\cW} - k_h(s,w) - f_W(s) - \diff{s} - = - \E\left[ - k_h(W_{i j},w) - \right] - \end{align*} - % - for all $w \in \cW$ almost surely. - Hence $l_i(w) = 0$ and so $L_n(w) = 0$ - for all $w \in \cW$ almost surely. - - \proofparagraph{no degeneracy} - - Suppose $\Dl > 0$. - As $f_{W|A}(\cdot \mid a)$ is $C_\rH$-Lipschitz - for all $a \in \cA$ and since $|k_h| \leq C_\rk/h$, - % - \begin{align*} - &\sup_{w \in \cW} - \left| - \E[k_h(W_{i j},w) \mid A_i] - - f_{W \mid A}(w \mid A_i) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW} - k_h(s,w) - f_{W \mid A}(s \mid A_i) - \diff{s} - - f_{W \mid A}(w \mid A_i) - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - \int_{\cW \cap [w \pm h]} - k_h(s,w) - \left( - f_{W \mid A}(s \mid A_i) - - f_{W \mid A}(w \mid A_i) - \right) - \diff{s} - \right| \\ - &\quad\leq - 2h - \frac{C_\rk}{h} - C_\rH h - \leq - 2 C_\rk C_\rH h - \end{align*} - % - almost surely. - Therefore, since $f_{W \mid A}(w \mid a) \leq C_\rd$, - we have - % - \begin{align*} - \sup_{w \in \cW} - \left| - \Var\big[ - \E[k_h(W_{i j},w) \mid A_i] - \big] - - \Var\left[ - f_{W \mid A}(w \mid A_i) - \right] - \right| - &\leq - 16 C_\rk C_\rH C_\rd h - \end{align*} - % - whenever $h$ is small enough that - $2 C_\rk C_\rH h \leq C_\rd$. Thus - % - \begin{align*} - \inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] - &\geq - \inf_{w \in \cW}\Var[f_{W \mid A}(w \mid A_i)] - - 16 C_\rk C_\rH C_\rd h. 
- \end{align*} - % - Therefore, if $\Dl > 0$, then eventually - $\inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \Dl/2$. - Finally, - % - \begin{align*} - \inf_{w \in \cW}\Var[L_n(w)] - &= - \frac{4}{n} - \inf_{w \in \cW} - \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] - \geq - \frac{2 \Dl}{n}. - \end{align*} - - \proofparagraph{partial degeneracy} - - Since $f_{W \mid A}(w \mid A_i)$ - is bounded by $C_\rd$ and $C_\rH$-Lipschitz in $w$, - we have that - $\Var[f_{W \mid A}(w \mid A_i)]$ - is continuous on $\cW$. - Thus if $\Dl = 0$, - there is at least one point $w \in \cW$ - for which - $\Var[f_{W \mid A}(w \mid A_i)] = 0$ - by compactness. - Let $w$ be any such degenerate point. - Then by the previous part, - % - \begin{align*} - \Var[L_n(w)] = - \frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] - &\leq - 64 C_\rk C_\rH C_\rd \frac{h}{n}. - \end{align*} - % - If conversely $w$ is not a degenerate point - then - $\Var[f_{W \mid A}(w \mid A_i)] > 0$ - so eventually - % - \begin{align*} - \Var[L_n(w)] - = \frac{4}{n} - \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] - &\geq - \frac{2}{n} - \Var[f_{W \mid A}(w \mid A_i)]. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_uniform_concentration}] - - We establish VC-type properties of function - classes and apply empirical process theory. - - \proofparagraph{establishing VC-type classes} - - Consider the following function classes: - % - \begin{align*} - \cF_1 - &= - \Big\{ - W_{i j} \mapsto - k_h(W_{i j},w) - : w \in \cW - \Big\}, \\ - \cF_2 - &= - \Big\{ - (A_i, A_j) \mapsto - \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] - : w \in \cW - \Big\}, \\ - \cF_3 - &= - \Big\{ - A_i \mapsto - \E\big[ k_h(W_{i j},w) \mid A_i \big] - : w \in \cW - \Big\}. - \end{align*} - % - For $\cF_1$, take $0 < \varepsilon \leq \Leb(\cW)$ - and $\cW_\varepsilon$ an $\varepsilon$-cover of $\cW$ - of cardinality at most $\Leb(\cW)/\varepsilon$. As - % - \begin{align*} - \sup_{s, w, w' \in \cW} - \left| - \frac{k_h(s,w) - k_h(s,w')} - {w-w'} - \right| - &\leq - \frac{C_\mathrm{L}}{h^2} - \end{align*} - % - almost surely, - we see that - % - \begin{align*} - \sup_\Q - N\left(\cF_1, \rho_\Q, - \frac{C_\mathrm{L}}{h^2} \varepsilon \right) - &\leq - N\left(\cF_1, \|\cdot\|_\infty, - \frac{C_\mathrm{L}}{h^2} \varepsilon \right) - \leq - \frac{\Leb(\cW)}{\varepsilon}, - \end{align*} - % - where $\Q$ ranges over Borel - probability measures on $\cW$. - Since - $\frac{C_\rk}{h}$ - is an envelope for $\cF_1$, - % - \begin{align*} - \sup_\Q - N\left(\cF_1, \rho_\Q, - \frac{C_\rk}{h} \varepsilon \right) - &\leq - \frac{C_\rL}{C_\rk} - \frac{\Leb(\cW)}{h \varepsilon}. - \end{align*} - % - Thus for all $\varepsilon \in (0,1]$, - % - \begin{align*} - \sup_\Q - N\left(\cF_1, \rho_\Q, - \frac{C_\rk}{h} \varepsilon \right) - &\leq - \frac{C_\rL}{C_\rk} - \frac{\Leb(\cW) \vee 1}{h \varepsilon} - \leq - (C_1/(h\varepsilon))^{C_2}, - \end{align*} - % - where - $C_1 = \frac{C_\rL}{C_\rk} (\Leb(\cW) \vee 1)$ - and $C_2 = 1$. 
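To spell out how the $\varepsilon$-cover $\cW_\varepsilon$ of $\cW$
induces the $\|\cdot\|_\infty$-cover of $\cF_1$ used above:
for any $w \in \cW$ there is some $w' \in \cW_\varepsilon$
with $|w - w'| \leq \varepsilon$,
and the Lipschitz property of the kernel gives
%
\begin{align*}
\sup_{s \in \cW}
\big|
k_h(s,w)
- k_h(s,w')
\big|
&\leq
\frac{C_\mathrm{L}}{h^2}
|w - w'|
\leq
\frac{C_\mathrm{L}}{h^2}
\varepsilon,
\end{align*}
%
so $\big\{ k_h(\cdot, w') : w' \in \cW_\varepsilon \big\}$
is a $\frac{C_\mathrm{L}}{h^2} \varepsilon$-cover of $\cF_1$
in the supremum norm,
which is the covering bound displayed above.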
- Next, $\cF_2$ forms a smoothly parameterized class of functions - since for $w,w' \in \cW$ we have - by the uniform Lipschitz properties of - $f_{W \mid AA}(\cdot \mid A_i, A_j)$ and - $k_h(s, \cdot)$, - with $|w-w'| \leq h$, - % - \begin{align*} - &\left| - \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] - - \E\big[ k_h(W_{i j},w') \mid A_i, A_j \big] - \right| \\ - &\quad= - \left| - \int_{[w \pm h] \cap \cW} - k_h(s,w) - f_{W \mid AA}(s \mid A_i, A_j) - \diff{s} - - \int_{[w' \pm h] \cap \cW} - k_h(s,w') - f_{W \mid AA}(s \mid A_i, A_j) - \diff{s} - \right| \\ - &\quad= - \left| - \int_{[w \pm 2h] \cap \cW} - \big( - k_h(s,w) - - k_h(s,w') - \big) - f_{W \mid AA}(s \mid A_i, A_j) - \diff{s} - \right| \\ - &\quad= - \left| - \int_{[w \pm 2h] \cap \cW} - \big( - k_h(s,w) - - k_h(s,w') - \big) - \big( - f_{W \mid AA}(s \mid A_i, A_j) - - f_{W \mid AA}(w \mid A_i, A_j) - \big) - \diff{s} - \right| \\ - &\quad\leq - 4h - \frac{C_\rL}{h^2} - |w-w'| - 2 C_\rH h - \leq - 8 C_\rL C_\rH - |w-w'| - \leq - C_3 - |w-w'|, - \end{align*} - % - where $C_3 = 8 C_\rL C_\rH$. - The same holds for $|w-w'| > h$ - as the Lipschitz property is local. - By taking $\E[\, \cdot \mid A_i]$, - it can be seen - by the contraction property of conditional expectation that - the same holds for the - singly-conditioned terms: - % - \begin{align*} - \left| - \E\big[ k_h(W_{i j},w) \mid A_i \big] - - \E\big[ k_h(W_{i j},w') \mid A_i \big] - \right| - &\leq - C_3 - |w-w'|. - \end{align*} - % - Therefore $\cF_3$ is also smoothly parameterized - in exactly the same manner. - Let - % - \begin{align*} - C_4 - &= - \sup_{w \in \cW} - \esssup_{A_i, A_j} - \big| - \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] - \big| \\ - &= - \sup_{w \in \cW} - \esssup_{A_i, A_j} - \left| - \int_{[w \pm h] \cap \cW} - k_h(s,w) - f_{W \mid AA}(s \mid A_i, A_j) - \diff{s} - \right| \\ - &\leq 2h \frac{C_\rk}{h} C_\rd - \leq 2 C_\rk C_\rd. - \end{align*} - % - For $\varepsilon \in (0,1]$, - take an $(\varepsilon C_4/C_3)$-cover of $\cW$ - of cardinality at most $C_3 \Leb(\cW) / (\varepsilon C_4)$. - By the above parameterization properties, - this cover induces an - $\varepsilon C_4$-cover for both $\cF_2$ and $\cF_3$: - % - \begin{align*} - \sup_\Q - N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) - &\leq - N\big(\cF_2, \|\cdot\|_\infty, \varepsilon C_4 \big) - \leq - C_3 \Leb(\cW) / (\varepsilon C_4), \\ - \sup_\Q - N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) - &\leq - N\big(\cF_3, \|\cdot\|_\infty, \varepsilon C_4 \big) - \leq - C_3 \Leb(\cW) / (\varepsilon C_4). - \end{align*} - % - Hence $\cF_1$, $\cF_2$, and $\cF_3$ - form VC-type classes with envelopes - $F_1 = C_\rk / h$ and $F_2 = F_3 = C_4$: - % - \begin{align*} - \sup_\Q - N\left(\cF_1, \rho_\Q, - \varepsilon C_\rk / h \right) - &\leq - (C_1/(h\varepsilon))^{C_2}, - &\sup_\Q - N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) - &\leq - (C_1/\varepsilon)^{C_2}, \\ - \sup_\Q - N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) - &\leq - (C_1/\varepsilon)^{C_2}, - \end{align*} - % - for some constants $C_1 \geq e$ and $C_2 \geq 1$, - where we augment the constants if necessary. - - \proofparagraph{controlling $L_n$} - - Observe that - $\sqrt{n}L_n$ - is the empirical process of the i.i.d.\ variables $A_i$ - indexed by $\cF_3$. - We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} - with $\sigma = C_4$: - % - \begin{align*} - \E \left[ - \sup_{w \in \cW} - \big| \sqrt{n} L_ - n(w) \big| - \right] - &\lesssim - C_4 - \sqrt{C_2 \log C_1} - + - \frac{C_4 C_2 \log C_1} - {\sqrt{n}} - \lesssim 1. 
- \end{align*} - % - By Lemma~\ref{lem:kernel_trichotomy}, - the left hand side is zero whenever - $\Du = 0$, - so we can also write - % - \begin{align*} - \E \left[ - \sup_{w \in \cW} - \big| \sqrt{n} L_n(w) \big| - \right] - &\lesssim - \Du. - \end{align*} - - \proofparagraph{controlling $Q_n$} - - Observe that $n Q_n$ - is the completely degenerate second-order U-process - of the i.i.d.\ variables $A_i$ - indexed by $\cF_2$. - This function class is again uniformly bounded and VC-type, - so applying the U-process maximal inequality from - Lemma~\ref{lem:kernel_app_uprocess_maximal} - yields with $\sigma = C_4$ - % - \begin{align*} - \E \left[ - \sup_{w \in \cW} - \big| n Q_n(w) \big| - \right] - &\lesssim - C_4 - C_2 \log C_1 - + - \frac{C_4 (C_2 \log C_1)^2} - {\sqrt{n}} - \lesssim 1. - \end{align*} - - \proofparagraph{controlling $E_n$} - - Conditional on $\bA_n$, - note that $n E_n$ - is the empirical process of the conditionally - i.n.i.d.\ variables $W_{i j}$ - indexed by $\cF_1$. - We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} - conditionally with - % - \begin{align*} - \sigma^2 - &= - \sup_{w \in \cW} - \E\Big[ - \big( - k_h(W_{i j},w) - - \E[k_h(W_{i j},w) \mid A_i, A_j] - \big)^2 - \mid A_i, A_j - \Big] - \leq - \sup_{w \in \cW} - \E\Big[ - k_h(W_{i j},w)^2 - \mid A_i, A_j - \Big] \\ - &\leq - \sup_{w \in \cW} - \int_{[w \pm h] \cap \cW} - k_h(s,w)^2 - f_{W \mid AA}(s \mid A_i, A_j) - \diff{s} - \leq 2h \frac{C_\rk^2}{h^2} - \lesssim 1/h - \end{align*} - % - and noting that we have - a sample size of - $\frac{1}{2}n(n-1)$, - giving - % - \begin{align*} - \E \left[ - \sup_{w \in \cW} - \big| n E_n(w) \big| - \right] - &\lesssim - \sigma - \sqrt{C_2 \log \big((C_1/h) F_1 / \sigma \big)} - + - \frac{F_1 C_2 \log \big((C_1/h) F_1 / \sigma\big)} - {n} \\ - &\lesssim - \frac{1}{\sqrt h} - \sqrt{C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} - + - \frac{(C_\rk/h) C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} - {n} \\ - &\lesssim - \sqrt{\frac{\log 1/h}{h}} - + - \frac{\log \big(1/h\big)} - {n h} - \lesssim - \sqrt{\frac{\log n}{h}}, - \end{align*} - % - where the last line follows by the bandwidth assumption - of $\frac{\log n}{n^2h} \to 0$. -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_uniform_consistency}] - This follows from Theorem~\ref{thm:kernel_bias} - and Lemma~\ref{lem:kernel_uniform_concentration}. -\end{proof} - -Before proving Theorem~\ref{thm:kernel_minimax} -we first give a lower bound result -for parametric point estimation in -Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}. - -\begin{lemma}[A Neyman--Pearson result for Bernoulli random variables] - \label{lem:kernel_app_neyman_pearson_bernoulli} - - Recall that the Bernoulli distribution - $\Ber(\theta)$ - places mass $\theta$ at $1$ and mass - $1-\theta$ at $0$. - Define $\P_\theta^n$ as the law of - $(A_1, A_2, \ldots, A_n, V)$, - where $A_1, \ldots, A_n$ - are i.i.d.\ $\Ber(\theta)$, - and $V$ is an $\R^d$-valued random variable - for some $d \geq 1$ - which is independent of the $A$ variables - and with a fixed distribution that does not depend on $\theta$. - Let $\theta_0 = \frac{1}{2}$ - and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$. 
- Then for any estimator $\tilde \theta_n$ - which is a function of - $(A_1, A_2, \ldots, A_n, V)$ only, - % - \begin{align*} - \P_{\theta_0}^n \left( - \big| \tilde \theta_n - \theta_0 \big| - \geq \frac{1}{\sqrt{32n}} - \right) - + \P_{\theta_{1,n}}^n \left( - \big| \tilde \theta_n - \theta_{1,n} \big| - \geq \frac{1}{\sqrt{32n}} - \right) - \geq \frac{1}{2}. - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}] - - Let $f: \{0,1\}^n \to \{0,1\}$ - be any function. - Considering this function as a statistical test, - the Neyman--Pearson lemma and Pinsker's inequality - \citep{gine2021mathematical} - give - % - \begin{align*} - \P_{\theta_0}^n \big( - f=1 - \big) - +\P_{\theta_{1,n}}^n \big( - f=0 - \big) - &\geq - 1- - \TV\left( - \P_{\theta_0}^n, - \P_{\theta_{1,n}}^n - \right) - \geq - 1- - \sqrt{ - \frac{1}{2} - \KL \left( - \P_{\theta_0}^n - \bigm\| - \P_{\theta_{1,n}}^n - \right)} \\ - &= - 1- - \sqrt{ - \frac{n}{2} - \KL \left( - \Ber(\theta_0) - \bigm\| - \Ber(\theta_{1,n}) - \right) - + \frac{n}{2} - \KL \left( - V - \bigm\| - V - \right)} \\ - &= - 1- - \sqrt{ - \frac{n}{2} - \KL \left( - \Ber(\theta_0) - \bigm\| - \Ber(\theta_{1,n}) - \right)}, - \end{align*} - % - where $\TV$ is the total variation distance - and $\KL$ is the Kullback--Leibler divergence. - In the penultimate line - we used the tensorization of Kullback--Leibler divergence - \citep{gine2021mathematical}, - noting that the law of $V$ is fixed and hence does not contribute. - We now evaluate this Kullback--Leibler divergence at the specified - parameter values. - % - \begin{align*} - \P_{\theta_0}^n \big( - f=1 - \big) - +\P_{\theta_{1,n}}^n \big( - f=0 - \big) - &\geq - 1- - \sqrt{ - \frac{n}{2} - \KL \left( - \Ber(\theta_0) - \bigm\| - \Ber(\theta_{1,n}) - \right)} \\ - &= - 1- - \sqrt{\frac{n}{2}} - \sqrt{ - \theta_0 \log \frac{\theta_0}{\theta_{1,n}} - + (1 - \theta_0) \log \frac{1 - \theta_0}{1 - \theta_{1,n}}} \\ - &= - 1- - \sqrt{\frac{n}{2}} - \sqrt{ - \frac{1}{2} \log \frac{1/2}{1/2 + 1/\sqrt{8n}} - + \frac{1}{2} \log \frac{1/2}{1/2 - 1/\sqrt{8n}}} \\ - &= - 1- - \frac{\sqrt n}{2} - \sqrt{\log \frac{1}{1 - 1/(2n)}} - \geq - 1- - \frac{\sqrt n}{2} - \sqrt{\frac{1}{n}} - = - \frac{1}{2}, - \end{align*} - % - where in the penultimate line we used that - $\log \frac{1}{1-x} \leq 2x$ - for $x \in [0,1/2]$. - Now define a test $f$ by - $f = 1$ if $\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}}$ - and $f=0$ otherwise, - to see - % - \begin{align*} - \P_{\theta_0}^n \left( - \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} - \right) - + \P_{\theta_{1,n}}^n \left( - \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} - \right) - \geq \frac{1}{2}. - \end{align*} - % - By the triangle inequality, - recalling that - $\theta_0 = \frac{1}{2}$ - and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, - we have - % - \begin{align*} - \left\{ - \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} - \right\} - &\subseteq - \left\{ - \left| \tilde \theta_n - \theta_0 \right| - \geq \frac{1}{\sqrt{32n}} - \right\} \\ - \left\{ - \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} - \right\} - &\subseteq - \left\{ - \left| \tilde \theta_n - \theta_{1,n} \right| - \geq \frac{1}{\sqrt{32n}} - \right\}. 
- \end{align*} - % - Thus by the monotonicity of measures, - % - \begin{align*} - \P_{\theta_0}^n \left( - \big| \tilde \theta_n - \theta_0 \big| - \geq \frac{1}{\sqrt{32n}} - \right) - + \P_{\theta_{1,n}}^n \left( - \big| \tilde \theta_n - \theta_{1,n} \big| - \geq \frac{1}{\sqrt{32n}} - \right) - \geq \frac{1}{2}. - \end{align*} -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_minimax}] - - \proofparagraph{lower bound for $\cP$} - - By translation and scaling of the data, - we may assume without loss of generality that $\cW = [-1,1]$. - We may also assume that $C_\rH \leq 1/2$, - since reducing $C_\rH$ can only shrink the class of distributions. - Define the dyadic distribution $\P_\theta$ - with parameter $\theta \in [1/2, 1]$ - as follows: - $A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, while - $V_{i j}$ for $1 \leq i < j \leq n$ are i.i.d.\ - and independent of $\bA_n$. - The distribution of $V_{i j}$ is given by its density function - $f_V(v) = \frac{1}{2} + C_\rH v$ on $[-1,1]$. - Finally, generate - $W_{i j} = W(A_i, A_j, V_{i j}) \vcentcolon= - (2 A_i A_j - 1) V_{i j}$. - Note that the function $W$ does not depend on $\theta$. - The conditional and marginal densities of $W_{i j}$ are - for $w \in [-1,1]$ - % - \begin{align*} - f_{W \mid AA}(w \mid A_i, A_j) - &= - \begin{cases} - \frac{1}{2} + C_\rH w & \text{if } A_i = A_j = 1, \\ - \frac{1}{2} - C_\rH w & \text{if } A_i = 0 \text{ or } A_j = 0, \\ - \end{cases} \\ - f_{W \mid A}(w \mid A_i) - &= - \begin{cases} - \frac{1}{2} + (2 \theta - 1) C_\rH w - & \text{if } A_i = 1, \\ - \frac{1}{2} - C_\rH w & \text{if } A_i = 0 , \\ - \end{cases} \\ - f_W(w)&= \frac{1}{2} + (2\theta^2 - 1) C_\rH w. - \end{align*} - % - Clearly, - $f_W \in \cH^\beta_{C_\rH}(\cW)$ and - $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$. - Also - $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV \leq 1$. - Therefore - $\P_\theta$ satisfies Assumption~\ref{ass:kernel_data} - and so - $\big\{\P_\theta : \theta \in [1/2, 1] \big\} \subseteq \cP$. - - Note that $f_W(1) = \frac{1}{2} + (2\theta^2 - 1) C_\rH $, - so $\theta^2 = \frac{1}{2 C_\rH}(f_W(1) - 1/2 + C_\rH)$. - Thus if $\tilde f_W$ is some density estimator - depending only on the data $\bW_n$, - we define the parameter estimator - % - \begin{align*} - \tilde \theta_n^2 - &\vcentcolon= - \frac{1}{2 C_\rH}\left( - \tilde f_W(1) - \frac{1}{2} + C_\rH - \right) - \vee 0. - \end{align*} - % - This gives the inequality - % - \begin{align*} - \big| - \tilde \theta_n^2 - \theta^2 - \big| - &= - \left| - \frac{1}{2 C_\rH}\left( - \tilde f_W(1) - \frac{1}{2} + C_\rH - \right) - \vee 0 - - - \frac{1}{2 C_\rH}\left( - f_W(1) - \frac{1}{2} + C_\rH - \right) - \right| \\ - &\leq - \frac{1}{2 C_\rH} - \sup_{w \in \cW} - \left| - \tilde f_W(w) - f_W(w) - \right|. - \end{align*} - % - Therefore, since also $\tilde \theta \geq 0$ - and $\theta \geq \frac{1}{2}$, - % - \begin{align*} - \big| - \tilde \theta_n - \theta - \big| - &= - \frac{\big|\tilde \theta_n^2 - \theta^2\big|} - {\tilde \theta_n + \theta} - \leq - \frac{1}{C_\rH} - \sup_{w \in \cW} - \left| - \tilde f_W(w) - f_W(w) - \right|. - \end{align*} - % - Now we apply the point estimation lower bound from - Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}, - setting $\theta_0 = \frac{1}{2}$ - and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, - noting that the estimator - $\tilde \theta_n$ - is a function of $\bW_n$ only, - thus is a function of $\bA_n$ and - $\bV_n$ only and so satisfies the conditions. 
- % - \begin{align*} - &\P_{\theta_0} \left( - \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(0)}_W(w) \big| - \geq \frac{1}{C\sqrt{n}} - \right) - + \P_{\theta_{1,n}} \left( - \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(1)}_W(w) \big| - \geq \frac{1}{C\sqrt{n}} - \right) \\ - &\quad\geq - \P_{\theta_0} \left( - \big| \tilde \theta_n - \theta_0 \big| - \geq \frac{1}{C C_\rH \sqrt{n}} - \right) - + \P_{\theta_{1,n}} \left( - \big| \tilde \theta_n - \theta_{1,n} \big| - \geq \frac{1}{C C_\rH \sqrt{n}} - \right) \\ - &\quad\geq - \P_{\theta_0} \left( - \big| \tilde \theta_n - \theta_0 \big| - \geq \frac{1}{\sqrt{32n}} - \right) - + \P_{\theta_{1,n}} \left( - \big| \tilde \theta_n - \theta_{1,n} \big| - \geq \frac{1}{\sqrt{32n}} - \right) - \geq - \frac{1}{2}, - \end{align*} - % - where we set $C \geq \frac{\sqrt{32}}{C_\rH}$. - Therefore we deduce that - % - \begin{align*} - \inf_{\tilde f_W} - \sup_{\P \in \cP} - \P\left( - \sup_{w \in \cW} - \big| - \tilde f_W(w) - f_W(w) - \big| - \geq - \frac{1}{C \sqrt n} - \right) - \geq \frac{1}{4} - \end{align*} - % - and so - % - \begin{align*} - \inf_{\tilde f_W} - \sup_{\P \in \cP} - \E_\P\left[ - \sup_{w \in \cW} - \big| - \tilde f_W(w) - f_W(w) - \big| - \right] - \geq \frac{1}{4 C \sqrt{n}}. - \end{align*} - - \proofparagraph{lower bound for $\cP_\rd$} - - For the subclass of totally degenerate distributions, - we rely on the main theorem - from \citet{khasminskii1978lower}. - Let $\cP_0$ be the subclass of $\cP_\rd$ - consisting of the distributions which satisfy - $A_1 = \cdots = A_n = 0$ - and $W_{i j} \vcentcolon= A_i + A_j + V_{i j} = V_{i j}$, - so that $W_{i j}$ are i.i.d.\ with common density $f_W = f_V$. - Define the class - % - \begin{align*} - \cF - &= - \left\{ - f \text{ density function on } \R, \ - f \in \cH^\beta_{C_\rH}(\cW) - \right\}. - \end{align*} - % - Write $\E_f$ for the expectation under $W_{i j}$ having density $f$. - Then by \citet{khasminskii1978lower}, - % - \begin{align*} - \liminf_{n \to \infty} - \inf_{\tilde f_W} - \sup_{f \in \cF} - \E_f\left[ - \left( \frac{n^2}{\log n} \right)^{\frac{\beta}{2\beta + 1}} - \sup_{w \in \cW} - \big| \tilde f_W(w) - f_W(w) \big| - \right] - > 0, - \end{align*} - % - where $\tilde f_W$ is any - density estimator - depending only on the $\frac{1}{2}n(n-1)$ i.i.d.\ data samples $\bW_n$. - Now every density function in - $\cH^\beta_{C_\rH}(\cW)$ - corresponds to a distribution in - $\cP_0$ and therefore to a distribution in $\cP_\rd$. - Thus for large enough $n$ and - some positive constant $C$, - % - \begin{align*} - \inf_{\tilde f_W} - \sup_{\P \in \cP_\rd} - \E_\P\left[ - \sup_{w \in \cW} - \big| \tilde f_W(w) - f_W(w) \big| - \right] - \geq - \frac{1}{C} - \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta + 1}}. - \end{align*} - - \proofparagraph{upper bounds} - - The upper bounds follow by - using a dyadic kernel density estimator $\hat f_W$ - with a boundary bias-corrected - Lipschitz kernel of order $p \geq \beta$ and a bandwidth of $h$. - Theorem~\ref{thm:kernel_bias} gives - % - \begin{align*} - \sup_{\P \in \cP} - \sup_{w \in \cW} - \big| - \E_\P\big[\hat f_W(w)\big] - - f_W(w) - \big| - \leq - \frac{4C_\rk C_\rH}{\flbeta !} - h^\beta. 
- \end{align*} - % - Then, - treating the degenerate and non-degenerate cases separately - and noting that all inequalities hold uniformly over - $\cP$ and $\cP_\rd$, - the proof of Lemma~\ref{lem:kernel_uniform_concentration} - shows that - % - \begin{align*} - \sup_{\P \in \cP} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| - \right] - &\lesssim - \frac{1}{\sqrt n} - + \sqrt{\frac{\log n}{n^2h}}, \\ - \sup_{\P \in \cP_\rd} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| - \right] - &\lesssim - \sqrt{\frac{\log n}{n^2h}}. - \end{align*} - % - Thus combining these yields that - % - \begin{align*} - \sup_{\P \in \cP} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - f_W(w)\big| - \right] - &\lesssim - h^\beta - + \frac{1}{\sqrt n} - + \sqrt{\frac{\log n}{n^2h}}, \\ - \sup_{\P \in \cP_\rd} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - f_W(w)\big| - \right] - &\lesssim - h^\beta - + \sqrt{\frac{\log n}{n^2h}}. - \end{align*} - % - Set $h = \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ - and note that $\beta \geq 1$ implies that - $\left(\frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} - \ll \frac{1}{\sqrt n}$. - So for $C > 0$, - % - \begin{align*} - \sup_{\P \in \cP} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - f_W(w)\big| - \right] - &\lesssim - \frac{1}{\sqrt n} - + \left( - \frac{\log n}{n^2} - \right)^{\frac{\beta}{2\beta+1}} - \leq - \frac{C}{\sqrt n}, \\ - \sup_{\P \in \cP_\rd} - \E_\P\left[ - \sup_{w \in \cW} - \big|\hat f_W(w) - f_W(w)\big| - \right] - &\leq - C\left( - \frac{\log n}{n^2} - \right)^{\frac{\beta}{2\beta+1}}. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_structure}] - - We write $k_{i j}$ for $k_h(W_{i j},w)$ - and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. - % - \begin{align*} - \Sigma_n(w,w') - &= - \E\Big[ - \big( - \hat f_W(w) - - \E[\hat f_W(w)] - \big) - \big( - \hat f_W(w') - - \E[\hat f_W(w')] - \big) - \Big] \\ - &= - \E\left[ - \left( - \frac{2}{n(n-1)} - \sum_{i \Du \frac{t + C_1 \log n}{\sqrt n} - \right) - \leq C_2 e^{-C_3 t}. - \end{align*} - % - Integrating tail probabilities shows that - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \Big|\sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\Big| - \right] - &\leq - \Du \frac{C_1 \log n}{\sqrt n} - + \int_0^\infty - \frac{\Du}{\sqrt n} - C_2 e^{-C_3 t} - \diff{t} - \lesssim - \frac{\Du \log n}{\sqrt n}. - \end{align*} - % - Further, - $Z_n^{L\prime}$ has the - same covariance structure as $G_n^{L\prime}$ in the - sense that for all $w, w' \in \cW$, - % - \begin{align*} - \E\big[Z_n^{L\prime}(w) Z_n^{L\prime}(w')\big] - = \E\big[G_n^{L\prime}(w) G_n^{L\prime}(w')\big], - \end{align*} - % - and clearly $L_n'$ - is equal in distribution to $L_n$. - To obtain the trajectory regularity property of - $Z_n^{L\prime}$, - note that it was shown in the proof of - Lemma~\ref{lem:kernel_uniform_concentration} - that for all $w,w' \in \cW$, - % - \begin{align*} - \left| - k_h^A(A_i,w) - - k_h^A(A_i,w') - \right| - &\leq - C - |w-w'| - \end{align*} - % - for some constant $C > 0$. 
- Therefore, since the $A_i$ are i.i.d., - % - \begin{align*} - &\E\left[ - \big| - Z_n^{L\prime}(w) - - Z_n^{L\prime}(w') - \big|^2 - \right]^{1/2} - = - \sqrt{n} - \E\left[ - \big| - L_n(w) - - L_n(w') - \big|^2 - \right]^{1/2} \\ - &\quad= - \sqrt{n} - \E\left[ - \left| - \frac{1}{n} - \sum_{i=1}^n - \Big( - k_h^A(A_i,w) - - k_h^A(A_i,w') - - \E\big[k_h^A(A_i,w)] - + \E\big[k_h^A(A_i,w')] - \Big) - \right|^2 - \right]^{1/2} \\ - &\quad= - \E\left[ - \Big| - k_h^A(A_i,w) - - k_h^A(A_i,w') - - \E\big[k_h^A(A_i,w)] - + \E\big[k_h^A(A_i,w')] - \Big|^2 - \right]^{1/2} - \lesssim - |w-w'|. - \end{align*} - % - Therefore, by - the regularity result for Gaussian processes in - Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - with $\delta_n \in (0, 1/2]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| - Z_n^{L\prime}(w) - - Z_n^{L\prime}(w') - \big| - \right] - &\lesssim - \int_0^{\delta_n} - \sqrt{\log 1/\varepsilon} - \diff{\varepsilon} - \lesssim - \delta_n \sqrt{\log 1/\delta_n} - \lesssim - \Du - \delta_n \sqrt{\log 1/\delta_n}, - \end{align*} - % - where the last inequality is because - $Z_n^{L\prime} \equiv 0$ whenever $\Du = 0$. - There is a modification of $Z_n^{L\prime}$ - with continuous trajectories - by Kolmogorov's continuity criterion - \citep[Theorem~2.9]{legall2016brownian}. - Note that $L_n'$ is $\bA_n'$-measurable - and so by Lemma~\ref{lem:kernel_app_kmt_corollary} - we can assume that $Z_n^{L\prime}$ - depends only on $\bA_n'$ and some - random noise which is independent of - $(\bA_n', \bV_n')$. - Finally, in order to have - $\bA_n', \bV_n', L_n'$, and $Z_n^{L\prime}$ - all defined on the same probability space, - we note that $\bA_n$ and $\bV_n$ are random vectors - while $L_n'$ and $Z_n^{L\prime}$ - are stochastic processes - with continuous sample paths - indexed on - the compact interval $\cW$. - Hence the Vorob'ev--Berkes--Philipp theorem - (Lemma~\ref{lem:kernel_app_vbp}) - allows us to ``glue'' them together - in the desired way - on another new probability space, giving - $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$, - retaining the single prime notation for clarity. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_strong_approx_Ln}] - See Lemma~\ref{lem:kernel_app_strong_approx_Ln} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}] - - We apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditional on - $\bA_n$. While this lemma is not in its current form - stated for conditional distributions, - the Yurinskii coupling on which it depends can be readily extended - by following the proof of \citet[Lemma~38]{belloni2019conditional}, - using a conditional version of Strassen's theorem - \cite[Theorem~B.2]{chen2020jackknife}. - Care must similarly be taken in embedding the conditionally Gaussian vectors - into a conditionally Gaussian process, using the - Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). - - By the mutual independence of $A_i$ and $V_{i j}$, - we have that the observations - $W_{i j}$ are independent - (but not necessarily identically distributed) - conditionally on $\bA_n$. - Note that - $\sup_{s,w \in \cW} |k_h(s,w)| \lesssim M_n = h^{-1}$ - and - $\E[k_h(W_{i j},w)^2 \mid \bA_n] \lesssim \sigma_n^2 = h^{-1}$. 
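Indeed, exactly as in the proof of
Lemma~\ref{lem:kernel_uniform_concentration},
%
\begin{align*}
\E\big[
k_h(W_{i j},w)^2
\mid \bA_n
\big]
&=
\int_{[w \pm h] \cap \cW}
k_h(s,w)^2
f_{W \mid AA}(s \mid A_i, A_j)
\diff{s}
\leq
2h
\frac{C_\rk^2}{h^2}
C_\rd
\lesssim
\frac{1}{h},
\end{align*}
%
uniformly over $w \in \cW$,
justifying the choice $\sigma_n^2 = h^{-1}$ up to constants.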
- The following uniform Lipschitz condition holds - with $l_{n,\infty} = C_\rL h^{-2}$, - by the Lipschitz property of the kernels: - % - \begin{align*} - \sup_{s,w,w' \in \cW} - \left| - \frac{k_h(s, w) - k_h(s, w')} - {w-w'} - \right| - \leq - l_{n,\infty}. - \end{align*} - % - Also, the following $L^2$ Lipschitz condition holds - uniformly with $l_{n,2} = 2 C_\rL \sqrt{C_\rd} h^{-3/2}$: - % - \begin{align*} - &\E\big[ - \big| - k_h(W_{i j}, w) - k_h(W_{i j}, w') - \big|^2 - \mid \bA_n - \big]^{1/2} \\ - &\quad\leq - \frac{C_\rL}{h^2} - |w-w'| - \left( - \int_{([w \pm h] \cup [w' \pm h]) \cap \cW} - f_{W \mid AA}(s \mid \bA_n) - \diff{s} - \right)^{1/2} \\ - &\quad\leq - \frac{C_\rL}{h^2} - |w-w'| - \sqrt{4h C_\rd} - \leq - l_{n,2} - |w-w'|. - \end{align*} - % - So we apply - Lemma~\ref{lem:kernel_app_yurinskii_corollary} - conditionally on $\bA_n$ - to the $\frac{1}{2}n(n-1)$ observations, - noting that - % - \begin{align*} - \sqrt{n^2h} E_n(w) - = - \sqrt{\frac{2 n h}{n-1}} - \sqrt{\frac{2}{n(n-1)}} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - \Big( - k_h(W_{i j},w) - - \E[k_h(W_{i j},w) \mid A_i, A_j] - \Big), - \end{align*} - % - to deduce that for $t_n > 0$ there exist - (an enlarged probability space) - conditionally mean-zero - and conditionally Gaussian processes - $\tilde Z_n^{E\prime}(w)$ - with the same conditional covariance structure as - $\sqrt{n^2 h} E_n(w)$ and - satisfying - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \big| - \sqrt{n^2h} E_n(w) - \tilde Z_n^{E\prime}(w) - \big| - > t_n - \Bigm\vert \bA_n' - \right) \\ - &\quad= - \P\left( - \sup_{w \in \cW} - \left| - \sqrt{\frac{n(n-1)}{2}} E_n(w) - - \sqrt{\frac{n-1}{2 n h}} \tilde Z_n^{E\prime}(w) - \right| - > \sqrt{\frac{n-1}{2 n h}} - t_n - \Bigm\vert \bA_n' - \right) \\ - &\quad\lesssim - \frac{ - \sigma_n - \sqrt{\Leb(\cW)} - \sqrt{\log n} - \sqrt{M_n + \sigma_n\sqrt{\log n}} - }{n^{1/2} t_n^2 / h} - \sqrt{ - l_{n,2} - \sqrt{\log n} - + \frac{l_{n,\infty}}{n} - \log n} \\ - &\quad\lesssim - \frac{ - h^{-1/2} - \sqrt{\log n} - \sqrt{h^{-1} + h^{-1/2} \sqrt{\log n}} - }{n^{1/2} t_n^2 / h} - \sqrt{ - h^{-3/2} - \sqrt{\log n} - + \frac{h^{-2}}{n} - \log n} \\ - &\quad\lesssim - \sqrt{\frac{\log n}{n}} - \frac{ - \sqrt{1 + \sqrt{h \log n}} - }{t_n^2} - \sqrt{ - \sqrt{\frac{\log n}{h^3}} - \left( 1 + \sqrt{\frac{\log n}{n^2 h}} \right) - } \\ - &\quad\lesssim - \sqrt{\frac{\log n}{n}} - \frac{ 1 }{t_n^2} - \left( - \frac{\log n}{h^3} - \right)^{1/4} - \lesssim - t_n^{-2} - n^{-1/2} - h^{-3/4} - (\log n)^{3/4}, - \end{align*} - % - where we used - $h \lesssim 1 / \log n$ - and $\frac{\log n}{n^2 h} \lesssim 1$. - To obtain the trajectory regularity property of - $\tilde Z_n^{E\prime}$, - note that - for $w, w' \in \cW$, - by conditional independence, - % - \begin{align*} - &\E\left[ - \big| - \tilde Z_n^{E\prime}(w) - - \tilde Z_n^{E\prime}(w') - \big|^2 - \mid \bA_n' - \right]^{1/2} - = - \sqrt{n^2h} \, - \E\left[ - \big| - E_n(w) - - E_n(w') - \big|^2 - \mid \bA_n - \right]^{1/2} \\ - &\quad\lesssim - \sqrt{n^2h} \, - \E\left[ - \left| - \frac{2}{n(n-1)} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - \Big( - k_h(W_{i j},w) - - k_h(W_{i j},w') - \Big) - \right|^2 - \Bigm\vert \bA_n - \right]^{1/2} \\ - &\quad\lesssim - \sqrt{h} \, - \E\left[ - \big| - k_h(W_{i j},w) - - k_h(W_{i j},w') - \big|^2 - \bigm\vert \bA_n - \right]^{1/2} - \lesssim - h^{-1} |w-w'|. 
- \end{align*} - % - So by the regularity result for Gaussian processes in - Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - with $\delta_n \in (0, 1/(2h)]$: - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| - \tilde Z_n^{E\prime}(w) - - \tilde Z_n^{E\prime}(w') - \big| - \mid \bA_n' - \right] - &\lesssim - \int_0^{\delta_n/h} - \sqrt{\log (\varepsilon^{-1} h^{-1})} - \diff{\varepsilon} - \lesssim - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h\delta_n}}, - \end{align*} - % - and there exists a modification with continuous trajectories. - Finally, in order to have $\bA_n', \bV_n', E_n'$, and $\tilde Z_n^{E\prime}$ - all defined on the same probability space, - we note that $\bA_n$ and $\bV_n$ are random vectors - while $E_n'$ and $\tilde Z_n^{E\prime}$ are stochastic processes - with continuous sample paths indexed on the compact interval $\cW$. - Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) - allows us to ``glue together'' $\big(\bA_n, \bV_n, E_n\big)$ - and $\big(E_n', \tilde Z_n^{E\prime}\big)$ - in the desired way on another new probability space, giving - $\big(\bA_n', \bV_n', E_n', \tilde Z_n^{E\prime}\big)$, - retaining the single prime notation for clarity. - - The trajectories of the conditionally Gaussian processes - $\tilde Z_n^{E\prime}$ depend on the choice of $t_n$, - necessitating the use of a divergent sequence $R_n$ to establish - bounds in probability. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_conditional_strong_approx_En}] - See Lemma~\ref{lem:kernel_app_conditional_strong_approx_En} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}] - - \proofparagraph{defining $Z_n^{E\dprime}$} - - Pick $\delta_n \to 0$ - with $\log 1/\delta_n \lesssim \log n$. - Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ - with cardinality $\Leb(\cW)/\delta_n$ - which is also a $\delta_n$-packing. - Let $\tilde Z_{n,\delta}^{E\prime}$ - be the restriction of $\tilde Z_n^{E\prime}$ - to $\cW_\delta$. - Let - $\tilde \Sigma_n^E(w, w') = - \E\big[\tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') - \mid \bA_n' \big]$ - be the conditional covariance function of $\tilde Z_n^{E\prime}$, - and define - $\Sigma_n^E(w,w') = \E\big[\tilde \Sigma_n^E(w,w')\big]$. - Let $\tilde \Sigma^E_{n,\delta}$ and $\Sigma^E_{n,\delta}$ - be the restriction matrices of - $\tilde \Sigma^E_n$ and $\Sigma^E_n$ - to $\cW_\delta \times \cW_\delta$, - noting that, as (conditional) covariance matrices, - these are - (almost surely) - positive semi-definite. - - Let $N \sim \cN(0, I_{|\cW_\delta|})$ - be independent of $\bA_n'$, - and define using the matrix square root - $\tilde Z_{n,\delta}^{E\dprime} - = \big(\tilde \Sigma^E_{n,\delta})^{1/2} N$, - which has the same distribution as - $\tilde Z_{n,\delta}^{E\prime}$, - conditional on $\bA_n'$. - Extend it using - the Vorob'ev--Berkes--Philipp theorem - (Lemma~\ref{lem:kernel_app_vbp}) - to the compact interval $\cW$, - giving a conditionally Gaussian process - $\tilde Z_n^{E\dprime}$ - which has the same distribution as - $\tilde Z_{n}^{E\prime}$, - conditional on $\bA_n'$. - Define - $Z_{n,\delta}^{E\dprime} = \big(\Sigma^E_{n,\delta})^{1/2} N$, - noting that this is independent of $\bA_n'$, - and extend it using - the Vorob'ev--Berkes--Philipp theorem - (Lemma~\ref{lem:kernel_app_vbp}) - to a Gaussian process - $Z_n^{E\dprime}$ on the compact interval $\cW$, - which is independent of $\bA_n'$ - and has covariance structure given by - $\Sigma_n^E$. 
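To verify that the matrix square root construction
has the claimed conditional law,
note that conditionally on $\bA_n'$
the vector
$\tilde Z_{n,\delta}^{E\dprime}
= \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} N$
is mean-zero Gaussian with conditional covariance
%
\begin{align*}
\E\Big[
\tilde Z_{n,\delta}^{E\dprime}
\big(\tilde Z_{n,\delta}^{E\dprime}\big)^\T
\Bigm| \bA_n'
\Big]
&=
\big(\tilde \Sigma^E_{n,\delta}\big)^{1/2}
\E\big[N N^\T\big]
\big(\tilde \Sigma^E_{n,\delta}\big)^{1/2}
=
\tilde \Sigma^E_{n,\delta},
\end{align*}
%
since $N$ is independent of $\bA_n'$
with $\E[N N^\T] = I_{|\cW_\delta|}$;
this matches the conditional covariance of
$\tilde Z_{n,\delta}^{E\prime}$, as claimed.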
- - \proofparagraph{closeness of $Z_n^{E\dprime}$ and - $\tilde Z_n^{E\dprime}$ on the mesh} - - Note that conditionally on $\bA_n'$, - $\tilde Z_{n,\delta}^{E\dprime} - Z_{n,\delta}^{E\dprime}$ - is a length-$|\cW_\delta|$ - Gaussian random vector with covariance matrix - $\big( - \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} - - \big(\Sigma^E_{n,\delta}\big)^{1/2} - \big)^2$. - So by the Gaussian maximal inequality in - Lemma~\ref{lem:kernel_app_gaussian_vector_maximal} - applied conditionally on $\bA_n'$, - % - \begin{align*} - \E\left[ - \max_{w \in \cW_\delta} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \Bigm| \bA_n' - \right] - &\lesssim - \sqrt{\log n} - \left\| - \tilde\Sigma^E_{n,\delta} - - \Sigma^E_{n,\delta} - \right\|_2^{1/2}, - \end{align*} - % - since $\log |\cW_\delta| \lesssim \log n$. - Next, we apply some U-statistic theory to - $\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta}$, - with the aim of applying the - matrix concentration result - for second-order U-statistics - presented in Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}. - Firstly, we note that - since - the conditional covariance structures of - $\tilde Z_n^{E\prime}$ and $\sqrt{n^2h} E_n$ - are equal in distribution, - we have, - writing $E_n(\cW_\delta)$ - for the vector $\big(E_n(w) : w \in \cW_\delta\big)$ - and similarly for $k_h(W_{i j}, \cW_\delta)$, - % - \begin{align*} - \tilde\Sigma^E_{n,\delta} - &= - n^2h \E[E_n(\cW_\delta) E_n(\cW_\delta)^\T \mid \bA_n] \\ - &= - n^2h - \frac{4}{n^2(n-1)^2} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - \E\left[ - \Big( - k_h(W_{i j}, \cW_\delta) - - \E\left[ - k_h(W_{i j}, \cW_\delta) - \mid \bA_n - \right] - \Big) - \right. \\ - &\qquad\left. - \times\Big( - k_h(W_{i j}, \cW_\delta) - - \E\left[ - k_h(W_{i j}, \cW_\delta) - \mid \bA_n - \right] - \Big)^\T - \bigm\vert \bA_n - \right] \\ - &= - \frac{4h}{(n-1)^2} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - u(A_i, A_j), - \end{align*} - % - where we - define the random - $|\cW_\delta| \times |\cW_\delta|$ - matrices - % - \begin{align*} - u(A_i, A_j) - &= - \E\!\left[ - k_h(W_{i j}, \cW_\delta) - k_h(W_{i j}, \cW_\delta)^\T - \mid \bA_n - \right] - - - \E\!\left[ - k_h(W_{i j}, \cW_\delta) - \mid \bA_n - \right] - \E\!\left[ - k_h(W_{i j}, \cW_\delta) - \mid \bA_n - \right]^\T. - \end{align*} - % - Let $u(A_i) = \E[u(A_i, A_j) \mid A_i]$ and - $u = \E[u(A_i, A_j)]$. - The decomposition - $\tilde \Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} - = \tilde L +\tilde Q$ - holds by Lemma~\ref{lem:kernel_app_general_hoeffding}, where - % - \begin{align*} - \tilde L - &= - \frac{4h}{n-1} - \sum_{i=1}^n - \big( - u(A_i) - u - \big), - &\tilde Q - &= - \frac{4h}{(n-1)^2} - \sum_{i=1}^{n-1} - \sum_{j=i+1}^{n} - \big( - u(A_i, A_j) - u(A_i) - u(A_j) + u - \big). - \end{align*} - % - Next, we seek an almost sure upper bound on - $\|u(A_i, A_j)\|_2$. - Since this is a symmetric matrix, - we have by H{\"o}lder's inequality - % - \begin{align*} - \|u(A_i, A_j)\|_2 - &\leq - \|u(A_i, A_j)\|_1^{1/2} - \|u(A_i, A_j)\|_\infty^{1/2} - = - \max_{1 \leq k \leq |\cW_\delta|} - \sum_{l=1}^{|\cW_\delta|} - |u(A_i, A_j)_{kl}|. 
- \end{align*} - % - The terms on the right hand side can be bounded as follows, - writing $w, w'$ for the $k$th and $l$th - points in $\cW_\delta$ respectively: - % - \begin{align*} - |u(A_i, A_j)_{kl}| - &= - \big| - \E\left[ - k_h(W_{i j}, w) - k_h(W_{i j}, w') - \mid \bA_n - \right] - - - \E\left[ - k_h(W_{i j}, w) - \mid \bA_n - \right] - \E\left[ - k_h(W_{i j}, w') - \mid \bA_n - \right] - \big| \\ - &\lesssim - \E\left[ - | - k_h(W_{i j}, w) - k_h(W_{i j}, w') - | - \mid \bA_n - \right] - + - \E\left[ - | - k_h(W_{i j}, w) - | - \mid \bA_n - \right] - \E\left[ - | - k_h(W_{i j}, w') - | - \mid \bA_n - \right] \\ - &\lesssim - h^{-1} - \I\big\{ |w-w'| \leq 2h \big\} - + 1 - \lesssim - h^{-1} - \I\big\{ |k-l| \leq 2h/\delta_n \big\} - + 1, - \end{align*} - % - where we used that - $|w-w'| \geq |k-l| \delta_n$ - because $\cW_\delta$ - is a $\delta_n$-packing. - Hence - % - \begin{align*} - \|u(A_i, A_j)\|_2 - &\leq - \max_{1 \leq k \leq |\cW_\delta|} - \sum_{l=1}^{|\cW_\delta|} - |u(A_i, A_j)_{kl}| - \lesssim - \max_{1 \leq k \leq |\cW_\delta|} - \sum_{l=1}^{|\cW_\delta|} - \Big( - h^{-1} - \I\big\{ |k-l| \leq 2h/\delta_n \big\} - + 1 - \Big) \\ - &\lesssim - 1/\delta_n - + 1/h - + |\cW_\delta| - \lesssim - 1/\delta_n - + 1/h. - \end{align*} - % - Clearly, the same bound holds for - $\|u(A_i)\|_2$ and $\|u\|_2$, by Jensen's inequality. - Therefore, applying the matrix Bernstein inequality - (Lemma~\ref{lem:kernel_app_matrix_bernstein}) - to the zero-mean matrix $\tilde L$ gives - % - \begin{align*} - \E\left[ - \left\| - \tilde L - \right\|_2 - \right] - &\lesssim - \frac{h}{n} - \left(\frac{1}{\delta_n} + \frac{1}{h} \right) - \left( - \log |\cW_\delta| + \sqrt{n \log |\cW_\delta|} - \right) - \lesssim - \left(\frac{h}{\delta_n} + 1 \right) - \sqrt{\frac{\log n}{n}}. - \end{align*} - % - The matrix U-statistic concentration inequality - (Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}) - with $\tilde Q$ gives - % - \begin{align*} - \E\left[ - \big\| - \tilde Q - \big\|_2 - \right] - &\lesssim - \frac{h}{n^2} - n - \left(\frac{1}{\delta_n} + \frac{1}{h} \right) - \left( - \log |\cW_\delta| - \right)^{3/2} - \lesssim - \left(\frac{h}{\delta_n} + 1 \right) - \frac{(\log n)^{3/2}}{n}. - \end{align*} - % - Hence taking a marginal expectation - and applying Jensen's inequality, - % - \begin{align*} - &\E\left[ - \max_{w \in \cW_\delta} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \right] \\ - &\quad\lesssim - \sqrt{\log n} \ - \E\left[ - \left\| - \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} - \right\|_2^{1/2} - \right] - \lesssim - \sqrt{\log n} \ - \E\left[ - \left\| - \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} - \right\|_2 - \right]^{1/2} \\ - &\quad\lesssim - \sqrt{\log n} \ - \E\left[ - \left\| - \tilde L - + \tilde Q - \right\|_2 - \right]^{1/2} - \lesssim - \sqrt{\log n} \ - \E\left[ - \left\| - \tilde L - \right\|_2 - + \left\| - \tilde Q - \right\|_2 - \right]^{1/2} \\ - &\quad\lesssim - \sqrt{\log n} - \left( - \left(\frac{h}{\delta_n} + 1 \right) - \sqrt{\frac{\log n}{n}} - + \left(\frac{h}{\delta_n} + 1 \right) - \frac{(\log n)^{3/2}}{n} - \right)^{1/2} \\ - &\quad\lesssim - \sqrt{\frac{h}{\delta_n} + 1} - \frac{(\log n)^{3/4}}{n^{1/4}}. 
- \end{align*} - - \proofparagraph{regularity of $Z_n^E$ and $\tilde Z_n^{E\prime}$} - - Define the semimetrics - % - \begin{align*} - \rho(w, w')^2 - &= - \E\left[ - \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big|^2 - \right], - &\tilde\rho(w, w')^2 - &= - \E\left[ - \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 - \mid \bA_n - \right]. - \end{align*} - % - We bound $\tilde \rho$ as follows, - since $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ - have the same conditional covariance structure: - % - \begin{align*} - \tilde\rho(w, w') - &= - \E\left[ - \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 - \mid \bA_n' - \right]^{1/2} \\ - &= - \sqrt{n^2 h} \, - \E\left[ - \big|E_n(w) - E_n(w')\big|^2 - \mid \bA_n' - \right]^{1/2} - \lesssim - h^{-1} - |w-w'|, - \end{align*} - % - uniformly in $\bA_n'$, - where the last line was shown in - the proof of Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. - Note that also - % - \begin{align*} - \rho(w, w') - &= - \sqrt{\E[\tilde \rho(w,w')^2]} - \lesssim - h^{-1} - |w-w'|. - \end{align*} - % - Thus Lemma~\ref{lem:kernel_app_gaussian_process_maximal} - applies directly to $Z_n^E$ - and conditionally to $\tilde Z_n^{E\prime}$, - with $\delta_n \in (0, 1/(2h)]$, - demonstrating that - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| - \bigm\vert \bA_n' - \right] - &\lesssim - \int_0^{\delta_n / h} - \sqrt{\log (1 / (\varepsilon h))} - \diff{\varepsilon} - \lesssim - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h \delta_n}}, \\ - \E\left[ - \sup_{|w-w'| \leq \delta_n} - |Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')| - \right] - &\lesssim - \int_0^{\delta_n / h} - \sqrt{\log (1 / (\varepsilon h))} - \diff{\varepsilon} - \lesssim - \frac{\delta_n}{h} - \sqrt{\log \frac{1}{h \delta_n}}. - \end{align*} - % - Continuity of trajectories follows from this. - - \proofparagraph{conclusion} - - We use the previous parts to deduce that - % - \begin{align*} - &\E\left[ - \sup_{w \in \cW} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \right] \\ - &\quad\lesssim - \E\left[ - \max_{w \in \cW_\delta} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \right] \\ - &\qquad+ - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \left\{ - \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| - + \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big| - \right\} - \right] \\ - &\quad\lesssim - \sqrt{\frac{h}{\delta_n} + 1} - \frac{(\log n)^{3/4}}{n^{1/4}} - + \frac{\delta_n \sqrt{\log n}}{h}. - \end{align*} - % - Setting - $\delta_n = h \left( \frac{\log n}{n} \right)^{1/6}$ - gives - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| - \right] - &\lesssim - n^{-1/6} (\log n)^{2/3}. - \end{align*} - % - Independence of $Z_n^{E\dprime}$ and $\bA_n''$ - follows by applying the - Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}), - conditionally on $\bA_n'$, to the variables - $\big(\bA_n', \tilde Z_n^{E\prime}\big)$ and - $\big(\tilde Z_n^{E\dprime}, Z_n^{E\dprime}\big)$. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_unconditional_strong_approx_En}] - See Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En} -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_app_strong_approx_fW}] - - We add together the strong approximations - for the $L_n$ and $E_n$ terms, - and then add an independent Gaussian process - to account for the variance of $Q_n$. 
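
  For orientation, recall that the centered estimator decomposes as
  %
  \begin{align*}
    \hat f_W(w) - \E\big[\hat f_W(w)\big]
    &=
    L_n(w) + Q_n(w) + E_n(w),
  \end{align*}
  %
  so it suffices to approximate $L_n$ and $E_n$
  by the Gaussian processes constructed in the preceding lemmas
  and to replace $Q_n$ by an independent Gaussian process
  with matching covariance.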
- - \proofparagraph{gluing together the strong approximations} - - Let $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$ - be the strong approximation for $L_n$ - derived in Lemma~\ref{lem:kernel_app_strong_approx_Ln}. - Let $\big(\bA_n'', \bV_n'', E_n'', \tilde Z_n^{E\dprime}\big)$ - and - $\big(\bA_n''', \bV_n''', \tilde Z_n^{E\tprime}, Z_n^{E\tprime}\big)$ - be the conditional and unconditional strong approximations for $E_n$ - given in Lemmas~\ref{lem:kernel_app_conditional_strong_approx_En} - and \ref{lem:kernel_app_unconditional_strong_approx_En} - respectively. - The first step is to define copies of these variables - and processes on the same probability space. - This is achieved by applying the - Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). - Dropping the prime notation for clarity, we construct - $\big(\bA_n, \bV_n, L_n, Z_n^L, E_n, \tilde Z_n^E, Z_n^E\big)$ - with the following properties: - % - \begin{enumerate}[label=(\roman*)] - - \item - $\sup_{w \in \cW} - \big| \sqrt{n} L_n(w) - Z_n^L(w)\big| - \lesssim_\P n^{-1/2} \log n$, - - \item - $\sup_{w \in \cW} - \big|\sqrt{n^2h} E_n(w) - \tilde Z^E_n(w) \big| - \lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$, - - \item - $\sup_{w \in \cW} - \big| \tilde Z^E_n(w) - Z^E_n(w) \big| - \lesssim_\P n^{-1/6} (\log n)^{2/3}$, - - \item - $Z_n^L$ is independent of $Z_n^E$. - - \end{enumerate} - % - Note that the independence of - $Z_n^L$ and $Z_n^E$ - follows since $Z_n^L$ - depends only on $\bA_n$ and some independent random noise, - while $Z_n^E$ is independent of $\bA_n$. - Therefore $(Z_n^L, Z_n^E)$ are jointly Gaussian. - To get the strong approximation result - for $\hat f_W$, - define the Gaussian process - % - \begin{align*} - Z_n^f(w) - &= - \frac{1}{\sqrt n} Z_n^L(w) - + \frac{1}{n} Z_n^Q(w) - + \frac{1}{\sqrt{n^2h}} Z_n^E(w), - \end{align*} - % - where $Z_n^Q(w)$ is a mean-zero Gaussian process - independent of everything else - with covariance - % - \begin{align*} - \E\big[ - Z_n^Q(w) - Z_n^Q(w') - \big] - &= - n^2 \E\big[ - Q_n(w) - Q_n(w') - \big]. - \end{align*} - % - As shown in the proof of - Lemma~\ref{lem:kernel_uniform_concentration}, - the process - $Q_n(w)$ is uniformly Lipschitz - and uniformly bounded in $w$. - Thus by Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - we have - $\E\big[\sup_{w \in \cW} - |Z_n^Q(w)|\big] - \lesssim 1$. - Therefore the uniform approximation error is given by - % - \begin{align*} - & - \sup_{w \in \cW} - \big| - \hat f_W(w) - \E[\hat f_W(w)] - - Z_n^f(w) - \big| - \\ - &\quad= - \sup_{w \in \cW} - \left| - \frac{1}{\sqrt n} Z_n^L(w) - + \frac{1}{n} Z_n^Q(w) - + \frac{1}{\sqrt{n^2h}} Z_n^E(w) - - \Big( - L_n(w) + Q_n(w) + E_n(w) - \Big) - \right| \\ - &\quad\leq - \sup_{w \in \cW} - \bigg( - \frac{1}{\sqrt n} - \left| - Z_n^L(w) - \sqrt{n} L_n(w) - \right| - + \frac{1}{\sqrt{n^2h}} - \left| - \tilde Z_n^E(w) - \sqrt{n^2h} E_n(w) - \right| \\ - &\qquad+ - \frac{1}{\sqrt{n^2h}} - \left| - Z_n^E(w) - \tilde Z_n^E(w) - \right| - \big| Q_n(w) \big| - + \frac{1}{n} - \big| Z_n^Q(w) \big| - \bigg) \\ - &\quad\lesssim_\P - n^{-1} \log n - + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-7/6} h^{-1/2} (\log n)^{2/3}. 
- \end{align*} - - \proofparagraph{covariance structure} - - Since $L_n$, $Q_n$, and $E_n$ - are mutually orthogonal in $L^2$ - (as shown in Lemma~\ref{lem:kernel_hoeffding}), - we have the following covariance - structure: - % - \begin{align*} - \E\big[Z_n^f(w) Z_n^f(w')\big] - &= - \frac{1}{n} \E\big[ Z_n^L(w) Z_n^L(w') \big] - + \frac{1}{n^2} \E\big[ Z_n^Q(w) Z_n^Q(w') \big] - + \frac{1}{n^2h} \E\big[ Z_n^E(w) Z_n^E(w') \big] \\ - &= - \E\big[ L_n(w) L_n(w') \big] - + \E\big[ Q_n(w) Q_n(w') \big] - + \E\big[ E_n(w) E_n(w') \big] \\ - &= - \E\big[ - \big(\hat f_W(w) - \E[\hat f_W(w)]\big) - \big(\hat f_W(w') - \E[\hat f_W(w')]\big) - \big]. - \end{align*} - - \proofparagraph{trajectory regularity} - - The trajectory regularity of the process - $Z_n^f$ follows directly by adding the regularities - of the processes $\frac{1}{\sqrt n} Z_n^L$, - $\frac{1}{n} Z_n^Q$, and $\frac{1}{\sqrt{n^2h}} Z_n^E$. - Similarly, $Z_n^f$ has continuous trajectories. -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_strong_approx_Tn}] - - Define $Z_n^T(w) = \frac{Z_n^f(w)}{\sqrt{\Sigma_n(w,w)}}$ so that - % - \begin{align*} - \left| T_n(w) - Z_n^T(w) \right| - &= \frac{\big| \hat f_W(w) - f_W(w) - Z_n^f(w) \big|} - {\sqrt{\Sigma_n(w,w)}}. - \end{align*} - % - By Theorems~\ref{thm:kernel_app_strong_approx_fW} and \ref{thm:kernel_bias}, - the numerator can be bounded above by - % - \begin{align*} - &\sup_{w \in \cW} - \left| - \hat f_W(w) - f_W(w) - - - Z_n^f(w) - \right| \\ - &\quad\leq - \sup_{w \in \cW} - \left| - \hat f_W(w) - - \E\big[\hat f_W(w)\big] - - - Z_n^f(w) - \right| - + \sup_{w \in \cW} - \left| - \E\big[\hat f_W(w)\big] - - f_W(w) - \right| \\ - &\quad\lesssim_\P - n^{-1} \log n - + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-7/6} h^{-1/2} (\log n)^{2/3} - + h^{p \wedge \beta}. - \end{align*} - % - By Lemma~\ref{lem:kernel_variance_bounds} - with $\inf_\cW f_W(w) > 0$, - the denominator is bounded below by - % - \begin{align*} - \inf_{w \in \cW} - \sqrt{\Sigma_n(w,w)} - &\gtrsim - \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}}, - \end{align*} - % - and the result follows. -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_infeasible_ucb}] - - Note that the covariance structure of $Z_n^T$ is given by - % - \begin{align*} - \Cov\big[ - Z_n^T(w), - Z_n^T(w') - \big] - &= - \frac{\Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. - \end{align*} - % - We apply an anti-concentration result - to establish that all quantiles of - $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ exist. - To do this, we must first establish regularity - properties of $Z_n^T$. - - \proofparagraph{$L^2$ regularity of $Z_n^T$} - - Writing $k_{i j}'$ for $k_h(W_{i j},w')$ etc., - note that by Lemma~\ref{lem:kernel_app_covariance_structure}, - % - \begin{align*} - &\big| - \Sigma_n(w,w') - - - \Sigma_n(w, w'') - \big| \\ - &\quad= - \left| - \frac{2}{n(n-1)} - \Cov\big[ - k_{i j}, - k_{i j}' - \big] - + - \frac{4(n-2)}{n(n-1)} - \Cov\big[ - k_{i j}, - k_{i r}' - \big] - \right. \\ - &\left. 
- \quad\qquad- - \frac{2}{n(n-1)} - \Cov\big[ - k_{i j}, - k_{i j}'' - \big] - - - \frac{4(n-2)}{n(n-1)} - \Cov\big[ - k_{i j}, - k_{i r}'' - \big] - \right| \\ - &\quad\leq - \frac{2}{n(n-1)} - \Big| - \Cov\big[ - k_{i j}, - k_{i j}' - k_{i j}'' - \big] - \Big| - + - \frac{4(n-2)}{n(n-1)} - \Big| - \Cov\big[ - k_{i j}, - k_{i r}' - k_{i r}'' - \big] - \Big| \\ - &\quad\leq - \frac{2}{n(n-1)} - \|k_{i j}\|_\infty - \|k_{i j}' - k_{i j}''\|_\infty - + - \frac{4(n-2)}{n(n-1)} - \|k_{i j}\|_\infty - \|k_{i r}' - k_{i r}''\|_\infty \\ - &\quad\leq - \frac{4}{n h^3} - C_\rk C_\rL - |w'-w''| - \lesssim - n^{-1}h^{-3} |w'-w''| - \end{align*} - % - uniformly in $w, w', w'' \in \cW$. - Therefore, by Lemma~\ref{lem:kernel_variance_bounds}, - with $\delta_n \leq n^{-2} h^2$, - we have - % - \begin{align*} - \inf_{|w-w'| \leq \delta_n} - \Sigma_n(w,w') - &\gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h} - - n^{-1} h^{-3} \delta_n - \gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h} - - \frac{1}{n^3h} - \gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h}, \\ - \sup_{|w-w'| \leq \delta_n} - \Sigma_n(w,w') - &\lesssim - \frac{\Du^2}{n} - + \frac{1}{n^2h} - + n^{-1} h^{-3} \delta_n - \lesssim - \frac{\Du^2}{n} - + \frac{1}{n^2h} - + \frac{1}{n^3h} - \lesssim - \frac{\Du^2}{n} - + \frac{1}{n^2h}. - \end{align*} - % - The $L^2$ - regularity of $Z_n^T$ - is - % - \begin{align*} - \E\left[ - \big( - Z_n^T(w) - Z_n^T(w') - \big)^2 - \right] - &= - 2 - 2 - \frac{\Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. - \end{align*} - % - Applying the elementary result - that for $a,b,c > 0$, - % - \begin{align*} - 1 - \frac{a}{\sqrt{b c}} - &= - \frac{b(c-a) + a(b-a)} - {\sqrt{b c}\big(\sqrt{b c} + a\big)}, - \end{align*} - % - with $a = \Sigma_n(w,w')$, - $b = \Sigma_n(w,w)$, - and $c = \Sigma_n(w',w')$, - and noting $|c-a| \lesssim n^{-1} h^{-3} |w-w'|$ - and $|b-a| \lesssim n^{-1} h^{-3} |w-w'|$ and - $\frac{\Dl^2}{n} + \frac{1}{n^2h} - \lesssim a,b,c \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}$, - yields - % - \begin{align*} - \E\left[ - \big( - Z_n^T(w) - Z_n^T(w') - \big)^2 - \right] - &\lesssim - \frac{(\Du^2/n + 1/(n^2h))n^{-1}h^{-3}|w-w'|} - {(\Dl^2/n + 1/(n^2h))^2} \\ - &\lesssim - \frac{n^{2} h^{-4}|w-w'|} - {n^{-4}h^{-2}} - \lesssim - n^2 h^{-2} |w-w'|. - \end{align*} - % - Thus the semimetric - induced by $Z_n^T$ on $\cW$ is - % - \begin{align*} - \rho(w,w') - &\vcentcolon= - \E\left[ - \big( - Z_n^T(w) - Z_n^T(w') - \big)^2 - \right]^{1/2} - \lesssim - n h^{-1} \sqrt{|w-w'|}. - \end{align*} - - \proofparagraph{trajectory regularity of $Z_n^T$} - - By the bound on $\rho$ from the previous part, - we deduce the covering number bound - % - \begin{align*} - N(\varepsilon, \cW, \rho) - &\lesssim - N\big( - \varepsilon, - \cW, - n h^{-1} \sqrt{|\cdot|} - \big) - \lesssim - N\big( - n^{-1} h \varepsilon, - \cW, - \sqrt{|\cdot|} - \big) \\ - &\lesssim - N\big( - n^{-2} h^2 \varepsilon^2, - \cW, - |\cdot| - \big) - \lesssim - n^2 h^{-2} \varepsilon^{-2}. - \end{align*} - % - Now apply the Gaussian process regularity result from - Lemma~\ref{lem:kernel_app_gaussian_process_maximal}. 
- % - \begin{align*} - \E\left[ - \sup_{\rho(w,w') \leq \delta} - \big| Z_n^T(w) - Z_n^T(w') \big| - \right] - &\lesssim - \int_0^{\delta} - \sqrt{\log N(\varepsilon, \cW, \rho)} - \diff{\varepsilon} - \lesssim - \int_0^{\delta} - \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} - \diff{\varepsilon} \\ - &\lesssim - \int_0^{\delta} - \left( - \sqrt{\log n} - + \sqrt{\log 1/\varepsilon} - \right) - \diff{\varepsilon} - \lesssim - \delta - \left( - \sqrt{\log n} - + \sqrt{\log 1/\delta} - \right), - \end{align*} - % - and so - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| Z_n^T(w) - Z_n^T(w') \big| - \right] - &\lesssim - \E\left[ - \sup_{\rho(w,w') \leq n h^{-1} \delta_n^{1/2}} - \big| Z_n^T(w) - Z_n^T(w') \big| - \right] - \lesssim - n h^{-1} - \sqrt{\delta_n \log n}, - \end{align*} - % - whenever $1/\delta_n$ - is at most polynomial in $n$. - - \proofparagraph{existence of the quantile} - - Apply the Gaussian anti-concentration - result from Lemma~\ref{lem:kernel_app_anticoncentration}, - noting that $Z_n^T$ is separable, - mean-zero, and has unit variance: - % - \begin{align*} - \sup_{t \in \R} - \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq 2\varepsilon_n - \right) - &\leq - 8 \varepsilon_n - \left( - 1 + \E\left[ - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \right] - \right). - \end{align*} - % - To bound the supremum on the right hand side, - apply the Gaussian process maximal inequality from - Lemma~\ref{lem:kernel_app_gaussian_process_maximal} - with - $\sigma \leq 1$ and - $N(\varepsilon, \cW, \rho) \lesssim n^2 h^{-2} \varepsilon^{-2}$: - % - \begin{align*} - \E\left[ - \sup_{w \in \cW} - \big|Z_n^T(w)\big| - \right] - &\lesssim - 1 - + \int_0^{2} - \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} - \diff{\varepsilon} - \lesssim - \sqrt{\log n}. - \end{align*} - % - Therefore - % - \begin{align*} - \sup_{t \in \R} - \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq \varepsilon - \right) - &\lesssim - \varepsilon - \sqrt{\log n}. - \end{align*} - % - Letting $\varepsilon \to 0$ - shows that the distribution function of - $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ - is continuous, - and therefore all of its quantiles exist. - - \proofparagraph{validity of the infeasible uniform confidence band} - - Under Assumption~\ref{ass:kernel_rates} and with a - sufficiently slowly diverging sequence $R_n$, - the strong approximation rate established in - Theorem~\ref{thm:kernel_strong_approx_Tn} is - % - \begin{align*} - &\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ - &\quad\lesssim_\P - \frac{ - n^{-1/2} \log n - + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-2/3} h^{-1/2} (\log n)^{2/3} - + n^{1/2} h^{p \wedge \beta}} - {\Dl + 1/\sqrt{n h}} - \ll \frac{1}{\sqrt{\log n}}. - \end{align*} - % - So by Lemma~\ref{lem:kernel_app_slow_convergence}, take $\varepsilon_n$ such - that - % - \begin{align*} - \P \left( - \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| - > \varepsilon_n - \right) - &\leq - \varepsilon_n \sqrt{\log n} - \end{align*} - % - and $\varepsilon_n \sqrt{\log n} \to 0$. 
So by the previously established anti-concentration result,
  %
  \begin{align*}
    &\P\left(
      \left|
        \hat f_W(w) - f_W(w)
      \right|
      \leq
      q_{1-\alpha}
      \sqrt{\Sigma_n(w,w)}
      \textup{ for all }
      w \in \cW
    \right) \\
    &\quad=
    \P\left(
      \sup_{w \in \cW}
      \left| T_n(w) \right|
      \leq
      q_{1-\alpha}
    \right) \\
    &\quad\leq
    \P\left(
      \sup_{w \in \cW}
      \left| Z_n^T(w) \right|
      \leq
      q_{1-\alpha}
      + \varepsilon_n
    \right)
    + \P \left(
      \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right|
      > \varepsilon_n
    \right) \\
    &\quad\leq
    \P\left(
      \sup_{w \in \cW}
      \left|
        Z_n^T(w)
      \right|
      \leq
      q_{1-\alpha}
    \right)
    + \P\left(
      \left|
        \sup_{w \in \cW}
        \big| Z_n^T(w) \big|
        - q_{1-\alpha}
      \right|
      \leq \varepsilon_n
    \right)
    + \varepsilon_n \sqrt{\log n} \\
    &\quad\leq
    1 - \alpha
    + 2 \varepsilon_n \sqrt{\log n}.
  \end{align*}
  %
  The lower bound follows analogously:
  %
  \begin{align*}
    &\P\left(
      \left|
        \hat f_W(w) - f_W(w)
      \right|
      \leq
      q_{1-\alpha}
      \sqrt{\Sigma_n(w,w)}
      \textup{ for all }
      w \in \cW
    \right) \\
    &\quad\geq
    \P\left(
      \sup_{w \in \cW}
      \left| Z_n^T(w) \right|
      \leq
      q_{1-\alpha}
      - \varepsilon_n
    \right)
    - \varepsilon_n \sqrt{\log n} \\
    &\quad\geq
    \P\left(
      \sup_{w \in \cW}
      \left|
        Z_n^T(w)
      \right|
      \leq
      q_{1-\alpha}
    \right)
    - \P\left(
      \left|
        \sup_{w \in \cW}
        \big| Z_n^T(w) \big|
        - q_{1-\alpha}
      \right|
      \leq \varepsilon_n
    \right)
    - \varepsilon_n \sqrt{\log n} \\
    &\quad\geq
    1 - \alpha
    - 2 \varepsilon_n \sqrt{\log n}.
  \end{align*}
  %
  Finally, we apply $\varepsilon_n \sqrt{\log n} \to 0$
  to see
  %
  \begin{align*}
    \left|
      \P\left(
        \left|
          \hat f_W(w) - f_W(w)
        \right|
        \leq
        q_{1-\alpha}
        \sqrt{\Sigma_n(w,w)}
        \textup{ for all }
        w \in \cW
      \right)
      - (1 - \alpha)
    \right|
    &\to 0.
  \end{align*}
\end{proof}

Before proving
Lemma~\ref{lem:kernel_app_covariance_estimation},
we provide the following useful
concentration inequality.
This is essentially a corollary of the
U-statistic concentration inequality given in
Theorem~3.3 in \citet{gine2000exponential}.

\begin{lemma}[A concentration inequality]
  \label{lem:kernel_app_dyadic_concentration}

  Let $X_{i j}$ be mutually independent for $1 \leq i < j \leq n$
  taking values in a measurable space $\cX$.
  Let $h_1$, $h_2$ be measurable functions from $\cX$ to $\R$
  satisfying the following for all $i$ and $j$.
  %
  \begin{align*}
    \E\big[h_1(X_{i j})\big]
    &= 0,
    &\E\big[h_2(X_{i j})\big]
    &= 0, \\
    \E\big[h_1(X_{i j})^2\big]
    &\leq \sigma^2,
    &\E\big[h_2(X_{i j})^2\big]
    &\leq \sigma^2, \\
    \big|h_1(X_{i j})\big|
    &\leq M,
    &\big|h_2(X_{i j})\big|
    &\leq M.
  \end{align*}
  %
  Consider the sum
  %
  \begin{align*}
    S_n
    &=
    \sum_{1 \leq i < j < r \leq n}
    h_1(X_{i j})
    h_2(X_{i r}).
  \end{align*}
  %
  Then $S_n$ satisfies the concentration inequality
  %
  \begin{align*}
    \P\big(
    |S_n| \geq t
    \big)
    &\leq
    C \exp\left(
    -\frac{1}{C}
    \min \left\{
    \frac{t^2}{n^3 \sigma^4},
    \frac{t}{\sqrt{n^3 \sigma^4}},
    \frac{t^{2/3}}{(n M \sigma)^{2/3}},
    \frac{t^{1/2}}{M}
    \right\}
    \right)
  \end{align*}
  %
  for some universal constant
  $C > 0$
  and for all $t>0$.

\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_dyadic_concentration}]

  We proceed in three main steps.
  Firstly, we write $S_n$ as a second-order U-statistic
  where we use double indices instead of single indices.
- Then we use a decoupling result to introduce extra independence. - Finally, a concentration result is applied - to the decoupled U-statistic. - - \proofparagraph{writing $S_n$ as a second-order U-statistic} - - Note that we can write $S_n$ as - the second-order U-statistic - % - \begin{align*} - S_n - &= - \sum_{1 \leq i < j \leq n} - \sum_{1 \leq q < r \leq n} - h_{i j q r} - (X_{i j}, X_{qr}), - \end{align*} - % - where - % - \begin{align*} - h_{i j q r} - (a,b) - &= - h_1(a) h_2(b) \, - \I\{j 0$ satisfying - % - $\P\big( |S_n| \geq t \big) - \leq C_1 \P\big( C_1 |\tilde S_n| \geq t \big)$, - % - where - % - $\tilde S_n = \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} - h_{i j q r} (X_{i j}, X'_{qr})$, - % - with $(X'_{i j})$ - an independent copy of $(X_{i j})$. - - \proofparagraph{U-statistic concentration} - - The U-statistic kernel $h_{i j q r}(X_{i j}, X'_{qr})$ - is totally degenerate in that - % - $ \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X_{i j}] - = \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X'_{qr}] = 0$. - % - Define and bound the following quantities: - % - \pagebreak - % - \begin{align*} - A - &= - \max_{i j q r} - \|h_{i j q r}(X_{i j}, X'_{qr})\|_\infty - \leq M^2, \\ - B - &= - \max - \left\{ - \left\| - \sum_{1 \leq i < j \leq n} - \E\Big[ - h_{i j q r}(X_{i j}, X'_{qr})^2 - \mid X_{i j} - \Big] - \right\|_\infty, - \left\| - \sum_{1 \leq q < r \leq n} - \E\Big[ - h_{i j q r}(X_{i j}, X'_{qr})^2 - \mid X'_{qr} - \Big] - \right\|_\infty - \right\}^{1/2} \\ - &= - \max - \left\{ - \left\| - \sum_{1 \leq i < j \leq n} - h_1(X_{i j})^2 - \E\big[ - h_2(X_{qr}')^2 - \big] - \I\{j 0$ and for all $t > 0$, - % - \begin{align*} - \P\left( - |\tilde S_n| \geq t - \right) - &\leq - C_2 \exp\left( - -\frac{1}{C_2} - \min \left\{ - \frac{t^2}{C^2}, - \frac{t}{D}, - \frac{t^{2/3}}{B^{2/3}}, - \frac{t^{1/2}}{A^{1/2}} - \right\} - \right) \\ - &\leq - C_2 \exp\left( - -\frac{1}{C_2} - \min \left\{ - \frac{t^2}{n^3 \sigma^4}, - \frac{t}{\sqrt{n^3 \sigma^4}}, - \frac{t^{2/3}}{(n M \sigma)^{2/3}}, - \frac{t^{1/2}}{M} - \right\} - \right). - \end{align*} - - \proofparagraph{Conclusion} - - By the previous parts - and absorbing constants into a new constant $C > 0$, - we therefore have - % - \begin{align*} - \P\left( - |S_n| \geq t - \right) - &\leq - C_1 \P\left( - C_1 |\tilde S_n| \geq t - \right) \\ - &\leq - C_1 C_2 \exp\left( - -\frac{1}{C_2} - \min \left\{ - \frac{t^2}{n^3 \sigma^4 C_1^2}, - \frac{t}{\sqrt{n^3 \sigma^4 C_1}}, - \frac{t^{2/3}}{(n M \sigma C_1)^{2/3}}, - \frac{t^{1/2}}{M C_1^{1/2}} - \right\} - \right) \\ - &\leq - C \exp\left( - -\frac{1}{C} - \min \left\{ - \frac{t^2}{n^3 \sigma^4}, - \frac{t}{\sqrt{n^3 \sigma^4}}, - \frac{t^{2/3}}{(n M \sigma)^{2/3}}, - \frac{t^{1/2}}{M} - \right\} - \right). - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_estimation}] - - Throughout this proof we will write - $k_{i j}$ for $k_h(W_{i j},w)$ and - $k_{i j}'$ for $k_h(W_{i j},w')$, - in the interest of brevity. - Similarly, we write $S_{i j r}$ to denote $S_{i j r}(w,w')$. - The estimand and estimator are reproduced below for clarity. - % - \begin{align*} - \Sigma_n(w,w') - &= - \frac{2}{n(n-1)} - \E[k_{i j} k_{i j}'] - + \frac{4(n-2)}{n(n-1)} - \E[k_{i j} k_{i r}'] - - \frac{4n-6}{n(n-1)} - \E[k_{i j}] - \E[k_{i j}'] \\ - \hat \Sigma_n(w,w') - &= - \frac{2}{n(n-1)} - \frac{2}{n(n-1)} - \sum_{i 0$ and since $n h \gtrsim \log n$, - the class $\cF$ has a constant envelope function - given by $F(a) \lesssim \sqrt{n h}$. 
- Clearly, $M = \sup_a F(a) \lesssim \sqrt{n h}$. - Also by definition of $\Sigma_n$ - and orthogonality of $L_n$, $Q_n$, and $E_n$, - we have - $\sup_{f \in \cF} \E[f(A_i)^2] \leq \sigma^2 = 1$. - To verify a VC-type condition on $\cF$ - we need to establish the regularity of the process. - By Lipschitz properties - of $L_n$ and $\Sigma_n$ - derived in the proofs of Lemma~\ref{lem:kernel_uniform_concentration} - and Theorem~\ref{thm:kernel_infeasible_ucb} - respectively, - we have - % - \begin{align*} - \left| - \frac{L_n(w)} - {\sqrt{\Sigma_n(w,w)}} - - \frac{L_n(w')} - {\sqrt{\Sigma_n(w',w')}} - \right| - &\lesssim - \frac{\big|L_n(w) - L_n(w')\big|} - {\sqrt{\Sigma_n(w,w)}} - + - \left| L_n(w') \right| - \left| - \frac{1} - {\sqrt{\Sigma_n(w,w)}} - - \frac{1} - {\sqrt{\Sigma_n(w',w')}} - \right| \\ - &\lesssim - \sqrt{n^2h} - |w-w'| - +\left| - \frac{\Sigma_n(w,w) - \Sigma_n(w',w')} - {\Sigma_n(w,w)\sqrt{\Sigma_n(w',w')}} - \right| \\ - &\lesssim - \sqrt{n^2h} - |w-w'| - + (n^2h)^{3/2} - \left| - \Sigma_n(w,w) - \Sigma_n(w',w') - \right| \\ - &\lesssim - \sqrt{n^2h} - |w-w'| - + (n^2h)^{3/2} - n^{-1} h^{-3} - |w-w'| - \lesssim - n^4 |w-w'|, - \end{align*} - % - uniformly over $w,w' \in \cW$. By compactness of $\cW$ we have the covering - number bound - % - $N(\cF, \|\cdot\|_\infty, \varepsilon) \lesssim - N(\cW, |\cdot|, n^{-4} \varepsilon) \lesssim n^4 \varepsilon^{-1}$. - % - Thus by Lemma~\ref{lem:kernel_app_maximal_vc_inid}, - % - \begin{align*} - \E \left[ - \sup_{w \in \cW} - \left| - \frac{L_n(w)} - {\sqrt{\Sigma_n(w,w)}} - \right| - \right] - &\lesssim - \sqrt{\log n} - + \frac{\sqrt{n h} \log n}{\sqrt{n}} - \lesssim - \sqrt{\log n}. - \end{align*} - % - Therefore - % - \begin{align*} - \sup_{w,w' \in \cW} - \left| - \frac{ - \hat f_W(w) \hat f_W(w') - - \E\big[k_{i j}\big] \E\big[k_{i j'}\big]} - {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} - \right| - &\lesssim_\P - \sqrt{\log n}. - \end{align*} - - \proofparagraph{decomposition of the $S_{i j r}$ term} - - We first decompose the $S_{i j r}$ term into two parts, - and obtain a pointwise concentration result for each. - This is extended to a uniform concentration result - by considering the regularity of the covariance estimator process. 
- Note that - $\E[S_{i j r}] = \E[k_{i j} k_{i r}']$, - and hence - % - \begin{align*} - &\frac{6}{n(n-1)(n-2)} - \sum_{i 0$: - % - \begin{align*} - &\P\left( - \left| - \sum_{i t - \biggm\vert \bA_n - \right) \\ - &\quad\leq - C_1 \exp\left( - -\frac{1}{C_1} - \min \left\{ - \frac{t^2}{n^3 \sigma^4}, - \frac{t}{\sqrt{n^3 \sigma^4}}, - \frac{t^{2/3}}{(n M \sigma)^{2/3}}, - \frac{t^{1/2}}{M} - \right\} - \right) \\ - &\quad\leq - C_1 \exp\left( - -\frac{1}{C_1} - \min \left\{ - \frac{t^2 h^2}{n^3}, - \frac{t h}{\sqrt{n^3}}, - \frac{t^{2/3} h}{n^{2/3}}, - t^{1/2} h - \right\} - \right), - \end{align*} - % - and therefore - with $t \geq 1$ - and since - $n h \gtrsim \log n$, - introducing and adjusting a new - constant $C_2$ where necessary, - % - \begin{align*} - &\P\left( - \left| - \frac{6}{n(n-1)(n-2)} - \sum_{i t - \frac{\log n}{\sqrt{n^3 h^2}} - \Bigm\vert \bA_n - \right) \\ - &\quad\leq - \P\left( - \left| - \sum_{i t - n^{3/2} h^{-1} \log n / 24 - \Bigm\vert \bA_n - \right) \\ - &\quad\leq - C_2 \exp\left( - -\frac{1}{C_2} - \min \left\{ - (t \log n)^2, - t \log n, - (t \log n)^{2/3} (n h)^{1/3}, - (t n h \log n)^{1/2} n^{1/4} - \right\} - \right) \\ - &\quad\leq - C_2 \exp\left( - -\frac{1}{C_2} - \min \left\{ - t \log n, - t \log n, - t^{2/3} \log n, - t^{1/2} n^{1/4} \log n - \right\} - \right) \\ - &\quad= - C_2 \exp\left( - -\frac{t^{2/3} \log n}{C_2} - \right) - = - C_2 - n^{-t^{2/3} / C_2}. - \end{align*} - % - Now for the term - in \eqref{eq:kernel_app_Sijr1_decomp2}, - note that - $\frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n]$ - is $\bA_n$-measurable and bounded uniformly in $i,j$. - Also, using the previously established conditional variance - and almost sure bounds on $k_{i j}$, - Bernstein's inequality - (Lemma~\ref{lem:kernel_app_bernstein}) - applied conditionally - gives for some constant $C_3 > 0$ - % - \begin{align*} - &\P\left( - \Bigg| - \frac{2}{(n-1)(n-2)} - \sum_{i=1}^{n-2} - \sum_{j=i+1}^{n-1} - \Big( - k_{i j} - - \E[k_{i j} \mid \bA_n] - \Big) - \cdot \frac{3}{n} - \sum_{r=j+1}^n - \E[k_{i r}' \mid \bA_n] - \Bigg| - > t - \sqrt{\frac{\log n}{n^2h}} - \Bigm\vert \bA_n - \right) \\ - &\qquad\leq - 2 \exp \left( - - \frac{t^2 n^2 \log n / (n^2h)} - {C_3/(2h) + C_3 t \sqrt{\log n / (n^2h)} / (2h)} - \right) \\ - &\qquad= - 2 \exp \left( - - \frac{t^2 \log n} - {C_3/2 + C_3 t \sqrt{\log n / (n^2h)} / 2} - \right) - \leq - 2 \exp \left( - - \frac{t^2 \log n}{C_3} - \right) - = - 2 n^{-t^2 / C_3}. - \end{align*} - % - The term in \eqref{eq:kernel_app_Sijr1_decomp3} - is controlled in exactly the same way. - Putting these together, noting the symmetry in $i,j,r$ - and taking a marginal expectation, - we obtain the unconditional pointwise concentration inequality - % - \begin{align*} - \P\left( - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i t - \frac{\log n}{\sqrt{n^3h^2}} - + t \sqrt{\frac{\log n}{n^2h}} - \right) - &\leq - C_2 n^{-t^{2/3} / C_2} - + 4 n^{-t^2 / (4C_3)}. - \end{align*} - % - Multiplying by - $\big(\Sigma_n(w,w) + \Sigma_n(w',w')\big)^{-1/2} \lesssim \sqrt{n^2h}$ - gives (adjusting constants if necessary) - % - \begin{align*} - &\P\left( - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i t \frac{\log n}{\sqrt{n h}} - + t \sqrt{\log n} - \right) \\ - &\quad\leq - C_2 n^{-t^{2/3} / C_2} - + 4 n^{-t^2 / (4C_3)}. - \end{align*} - - \proofparagraph{pointwise concentration of the $S_{i j r}^{(2)}$ term} - - We apply the U-statistic concentration inequality from - Lemma~\ref{lem:kernel_app_ustat_concentration}. 
- Note that the terms - $\E[S_{i j r} \mid \bA_n]$ - are permutation-symmetric functions of - the random variables - $A_i, A_j$, and $A_r$ only, - making $S_{i j r}^{(2)}$ the summands of - a (non-degenerate) mean-zero third-order U-statistic. - While we could apply a third-order Hoeffding decomposition - here to achieve degeneracy, - it is unnecessary as Lemma~\ref{lem:kernel_app_ustat_concentration} - is general enough to deal with the non-degenerate case directly. - The quantity of interest here is - % - \begin{align*} - \frac{6}{n(n-1)(n-2)} - \sum_{i t \sqrt{\log n} \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')} - \right) \\ - &\quad\leq - 4 \exp \left( - - \frac{n t^2 (\Sigma_n(w,w) + \Sigma_n(w',w')) \log n} - {C_4 (n\Sigma_n(w,w) + n\Sigma_n(w',w')) - + C_4 t \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}\sqrt{\log n}} - \right) \\ - &\quad\leq - 4 \exp \left( - - \frac{t^2 \log n} - {C_4 - + C_4 t (\Sigma_n(w,w) + \Sigma_n(w',w'))^{-1/2} \sqrt{\log n} / n} - \right) \\ - &\quad\leq - 4 \exp \left( - - \frac{t^2 \log n} - {C_4 - + C_4 t \sqrt{h}} - \right) - \leq - 4 n^{-t^2 / C_4} - \end{align*} - % - for some universal constant $C_4 > 0$ - (which may change from line to line), - since the order of this U-statistic is fixed at three. - - \proofparagraph{concentration of the $S_{i j r}$ term on a mesh} - - Pick $\delta_n \to 0$ - with $\log 1/\delta_n \lesssim \log n$. - Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ - with cardinality $O(1/\delta_n)$. - Then $\cW_\delta \times \cW_\delta$ - is a $2\delta_n$-covering of $\cW \times \cW$ - with cardinality $O(1/\delta_n^2)$, - under the Manhattan metric - $d\big((w_1, w_1'), (w_2, w_2')\big) - = |w_1 - w_2| + |w_1' - w_2'|$. - By the previous parts, - we have that for fixed $w$ and $w'$: - % - \begin{align*} - &\P\Bigg( - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i t \frac{\log n}{\sqrt{n h}} - + 2t \sqrt{\log n} - \Bigg) \\ - &\quad\leq - C_2 n^{-t^{2/3} / C_2} - + 4 n^{-t^2 / (4C_3)} - + 4 n^{-t^2 / C_4}. - \end{align*} - % - Taking a union bound over $\cW_\delta \times \cW_\delta$, - noting that $n h \gtrsim \log n$ - and adjusting constants gives - % - \begin{align*} - &\P\Bigg( - \sup_{w, w' \in \cW_\delta} - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i t \sqrt{\log n} - \Bigg) \\ - &\quad\lesssim - \delta_n^{-2} - \Big( - C_2 n^{-t^{2/3} / C_2} - + 4 n^{-t^2 / (4C_3)} - + 4 n^{-t^2 / C_4} - \Big) - \lesssim - \delta_n^{-2} - n^{-t^{2/3} / C_5}, - \end{align*} - % - for some constant $C_5 > 0$. - - \proofparagraph{regularity of the $S_{i j r}$ term} - - Next we bound the fluctuations in $S_{i j r}(w,w')$. - Writing $k_{i j}(w)$ for $k_h(W_{i j},w)$, - note that - % - \begin{align*} - \big| - k_{i j}(w_1) - k_{i r}(w_1') - - k_{i j}(w_2) - k_{i r}(w_2') - \big| - &\lesssim - \frac{1}{h} - \big| k_{i j}(w_1) - k_{i j}(w_2) \big| - + - \frac{1}{h} - \big| k_{i r}(w_1') - k_{i r}(w_2') \big| \\ - &\lesssim - \frac{1}{h^3} - \Big( - |w_1 - w_2| - + |w_1' - w_2'| - \Big), - \end{align*} - % - by the Lipschitz property of the kernel, - and similarly for the other summands in $S_{i j r}$. - Therefore, - % - \begin{align*} - \sup_{|w_1-w_2| \leq \delta_n} - \sup_{|w_1'-w_2'| \leq \delta_n} - \big| - S_{i j r}(w_1, w_1') - - S_{i j r}(w_2, w_2') - \big| - &\lesssim - \delta_n h^{-3}. 
- \end{align*} - % - Also as noted in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, - % - \begin{align*} - \sup_{|w_1-w_2| \leq \delta_n} - \sup_{|w_1'-w_2'| \leq \delta_n} - \big| - \Sigma_n(w_1,w_1') - - - \Sigma_n(w_2, w_2') - \big| - &\lesssim - \delta_n n^{-1}h^{-3}. - \end{align*} - % - Therefore, since $\sqrt{\Sigma_n(w,w)} \gtrsim \sqrt{n^2h}$ - and $|S_{i j r}| \lesssim h^{-2}$, - using - $\frac{a}{\sqrt b} - \frac{c}{\sqrt d} - = \frac{a-c}{\sqrt b} + c \frac{d-b}{\sqrt{b d} \sqrt{b+d}}$, - % - \begin{align*} - &\sup_{|w_1-w_2| \leq \delta_n} - \sup_{|w_1'-w_2'| \leq \delta_n} - \left| - \frac{S_{i j r}(w_1, w_1')} - {\sqrt{\Sigma_n(w_1,w_1) + \Sigma_n(w_1',w_1')}} - - \frac{S_{i j r}(w_2, w_2')} - {\sqrt{\Sigma_n(w_2,w_2) + \Sigma_n(w_2',w_2')}} - \right| \\ - &\quad\lesssim - \delta_n h^{-3} \sqrt{n^2h} - + h^{-2} \delta_n n^{-1} h^{-3} (n^2h)^{3/2} - \lesssim - \delta_n n h^{-5/2} - + \delta_n n^{2} h^{-7/2} - \lesssim - \delta_n n^{6}, - \end{align*} - % - where in the last line we use that - $1/h \lesssim n$. - - \proofparagraph{uniform concentration of the $S_{i j r}$ term} - - By setting - $\delta_n = n^{-6} \sqrt{\log n}$, - the fluctuations can be at most $\sqrt{\log n}$, - so we have for $t \geq 1$ - % - \begin{align*} - &\P\Bigg( - \sup_{w, w' \in \cW} - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i 2t \sqrt{\log n} - \Bigg) \\ - &\quad\lesssim - \delta_n^{-2} - n^{-t^{2/3} / C_5} - \lesssim - n^{12-t^{2/3} / C_5}. - \end{align*} - % - This converges to zero for any sufficiently large $t$, so - % - \begin{align*} - \sup_{w, w' \in \cW} - \Bigg| - \frac{6}{n(n-1)(n-2)} - \sum_{i t - \sqrt{\frac{\log n}{n^2h^3}} - \Bigm\vert \bA_n - \right) \\ - &\quad\leq - 2 \exp\left( - - \frac{t^2 n^2 \log n / (n^2h^3)} - {C_6 h^{-3} / 2 + C_6 t h^{-2} \sqrt{\log n / (n^2h^3)} / 2} - \right) \\ - &\quad\leq - 2 \exp\left( - - \frac{t^2 \log n} - {C_6 / 2 + C_6 t \sqrt{\log n / (n^2h)} / 2} - \right) - \leq - 2 \exp\left( - \frac{t^2 \log n}{C_6} \right) - = 2 n^{-t^2 / C_6}, - \end{align*} - % - where $C_6$ is a universal positive constant. - - \proofparagraph{pointwise concentration of the $S_{i j}^{(2)}$ term} - - We apply the U-statistic concentration inequality from - Lemma~\ref{lem:kernel_app_ustat_concentration}. - Note that $S_{i j}^{(2)}$ - are permutation-symmetric functions of - the random variables - $A_i$ and $A_j$ only, - making them the summands of - a (non-degenerate) mean-zero second-order U-statistic. - Note that - $\big|S_{i j}^{(2)}\big| \lesssim h^{-1}$ - and so trivially - $\E\big[\E[S_{i j}^{(2)} \mid A_i ]^2 \big] \lesssim h^{-2}$. - Thus by Lemma~\ref{lem:kernel_app_ustat_concentration}, - since the order of this U-statistic is fixed at two, - for some universal positive constant $C_7$ we have - % - \begin{align*} - \P\left( - \Bigg| - \frac{2}{n(n-1)} - \sum_{i t - \sqrt{\frac{\log n}{n h^2}} - \right) - &\leq - 2 \exp\left( - - \frac{t^2 n \log n / (n h^2)} - {C_7 h^{-2} / 2 + C_7 t h^{-1} \sqrt{\log n / (n h^2)} / 2} - \right) \\ - &\leq - 2 \exp\left( - - \frac{t^2 \log n} - {C_7 / 2 + C_7 t \sqrt{\log n / n} / 2} - \right) \\ - &\leq - 2 \exp\left( - - \frac{t^2 \log n}{C_7} - \right) - = - 2 n^{-t^2 / C_7}. - \end{align*} - - \proofparagraph{concentration of the $k_{i j}k_{i j}'$ term on a mesh} - - As before, use a union bound - on the mesh $\cW_\delta \times \cW_\delta$. 
- % - \begin{align*} - &\P\left( - \sup_{w,w' \in \cW_\delta} - \left| - \frac{2}{n(n-1)} - \sum_{i t \sqrt{\frac{\log n}{n^2h^3}} - + t \sqrt{\frac{\log n}{n h^2}} - \right) \\ - &\ \leq - \P\!\left( - \!\sup_{w,w' \in \cW_\delta} - \Bigg| - \frac{2}{n(n-1)} - \sum_{i t - \sqrt{\frac{\log n}{n^2h^3}} - \right) - \! + \P\!\left( - \!\sup_{w,w' \in \cW_\delta} - \Bigg| - \frac{2}{n(n-1)} - \sum_{i t - \sqrt{\frac{\log n}{n h^2}} - \right) \\ - &\ \lesssim - \delta_n^{-2} n^{-t^2 / C_6} - + \delta_n^{-2} n^{-t^2 / C_7}. - \end{align*} - - \proofparagraph{regularity of the $k_{i j}k_{i j}'$ term} - - As for the $S_{i j r}$ term, - % - $\big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| - \lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big)$. - - \proofparagraph{uniform concentration of the $k_{i j}k_{i j}'$ term} - - Setting $\delta_n = h^3\sqrt{\log n / (n h^2)}$, - the fluctuations are at most $\sqrt{\log n / (n h^2)}$, - so for $t \geq 1$ - % - \begin{align*} - &\P\left( - \sup_{w,w' \in \cW} - \left| - \frac{2}{n(n-1)} - \sum_{i t \sqrt{\frac{\log n}{n^2h^3}} - + 2t \sqrt{\frac{\log n}{n h^2}} - \right) \\ - &\quad\leq - \P\left( - \sup_{w,w' \in \cW_\delta} - \left| - \frac{2}{n(n-1)} - \sum_{i t \sqrt{\frac{\log n}{n^2h^3}} - + t \sqrt{\frac{\log n}{n h^2}} - \right) \\ - &\qquad+ - \P\left( - \sup_{|w_1-w_2| \leq \delta_n} - \sup_{|w_1'-w_2'| \leq \delta_n} - \big| - k_{i j}(w_1) - k_{i j}(w_1') - - k_{i j}(w_2) - k_{i j}(w_2') - \big| - > t \sqrt{\frac{\log n}{n h^2}} - \right) \\ - &\quad\lesssim - \delta_n^{-2} n^{-t^2 / C_6} - + \delta_n^{-2} n^{-t^2 / C_7} - \lesssim - n^{1-t^2 / C_6} h^{-4} - + n^{1-t^2 / C_7} h^{-4} - \lesssim - n^{5-t^2 / C_8}, - \end{align*} - % - where $C_8 > 0$ is a constant and - in the last line we use $1/h \lesssim n$. - This converges to zero for any sufficiently large $t$, - so by Lemma~\ref{lem:kernel_variance_bounds} we have - % - \begin{align*} - \sup_{w,w' \in \cW} - \left| - \frac{2}{n(n-1)} - \sum_{i 0$ - there exists a feasible function $M_\varepsilon$ with - $\objective(M_\varepsilon) \leq \objective^* + \varepsilon$, - and we call such a solution $\varepsilon$-optimal. - Let $\hat \Sigma_n^+$ be an $n^{-1}$-optimal solution. - Then - % - \begin{align*} - \objective(\hat \Sigma_n^+) - &\leq \objective^* + n^{-1} - \leq \objective(\Sigma_n) + n^{-1}. - \end{align*} - % - Thus by the triangle inequality, - % - \begin{align*} - \sup_{w,w' \in \cW} - \left| - \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} - \right| - &\leq - \objective(\hat \Sigma_n^+) - + \objective(\Sigma_n) - \leq 2 \, \objective(\Sigma_n) + n^{-1} - \lesssim_\P - \frac{\sqrt{\log n}}{n}. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_variance_estimator_bounds}] - - Since $\hat \Sigma_n^+$ is positive semi-definite, - we must have $\hat \Sigma_n^+(w,w) \geq 0$. - Now Lemma~\ref{lem:kernel_app_sdp} - implies that for all $\varepsilon \in (0,1)$ - there exists a $C_\varepsilon$ such that - % - \begin{align*} - &\P\left( - \Sigma_n(w,w) - C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)} - \leq - \hat \Sigma_n^+(w,w) - \right. - \\ - &\left. - \qquad\leq - \Sigma_n(w,w) + C_\varepsilon \frac{\sqrt{\log n}}{n} - \sqrt{\Sigma_n(w,w)}, - \quad \forall w \in \cW - \right) - \geq 1-\varepsilon. - \end{align*} - % - Consider the function - $g_a(t) = t - a \sqrt{t}$ - and note that it is increasing on $\{t \geq a^2/4\}$. 
- Applying this with $t = \Sigma_n(w,w)$ - and $a = \frac{\sqrt{\log n}}{n}$, - noting that by Lemma~\ref{lem:kernel_variance_bounds} we have - $t = \Sigma_n(w,w) \gtrsim \frac{1}{n^2h} - \gg \frac{\log n}{4n^2} = a^2/4$, - shows that for $n$ large enough, - % - \begin{align*} - \inf_{w \in \cW} \Sigma_n(w,w) - - \frac{\sqrt{\log n}}{n} \sqrt{\inf_{w \in \cW} \Sigma_n(w,w)} - \lesssim_\P - \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ - \sup_{w \in \cW}\hat \Sigma_n^+(w,w) - \lesssim_\P - \sup_{w \in \cW} \Sigma_n(w,w) - + \frac{\sqrt{\log n}}{n} \sqrt{\sup_{w \in \cW} \Sigma_n(w,w)}. - \end{align*} - % - Applying the bounds from Lemma~\ref{lem:kernel_variance_bounds} - yields - % - \begin{align*} - \frac{\Dl^2}{n} + \frac{1}{n^2h} - - \frac{\sqrt{\log n}}{n} - \left( \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) - \lesssim_\P - \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ - \sup_{w \in \cW}\hat \Sigma_n^+(w,w) - \lesssim_\P - \frac{\Du^2}{n} + \frac{1}{n^2h} - + \frac{\sqrt{\log n}}{n} - \left( \frac{\Du}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) - \end{align*} - % - and so - % - \begin{align*} - \frac{\Dl^2}{n} + \frac{1}{n^2h} - \lesssim_\P - \inf_{w \in \cW}\hat \Sigma_n^+(w,w) - \leq - \sup_{w \in \cW}\hat \Sigma_n^+(w,w) - \lesssim_\P - \frac{\Du^2}{n} + \frac{1}{n^2h}. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_sdp}] - See Lemma~\ref{lem:kernel_app_covariance_estimation} - and Lemma~\ref{lem:kernel_app_sdp}. -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_studentized_t_statistic}] - % - We have - % - \begin{align*} - &\sup_{w \in \cW} - \left| \hat T_n(w) - T_n(w) \right| - = - \sup_{w \in \cW} - \bigg\{ - \left| - \hat f_W(w) - f_W(w) - \right| - \cdot - \bigg| - \frac{1} - {\hat\Sigma_n^+(w,w)^{1/2}} - - - \frac{1}{\Sigma_n(w,w)^{1/2}} - \bigg| - \bigg\} \\ - &\quad\leq - \sup_{w \in \cW} - \left| - \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} - {\sqrt{\Sigma_n(w,w)}} - + \frac{\E\big[\hat f_W(w)\big] - f_W(w)} - {\sqrt{\Sigma_n(w,w)}} - \right| - \cdot \sup_{w \in \cW} - \left| - \frac{\hat\Sigma_n^+(w,w) - \Sigma_n(w,w)} - {\sqrt{\Sigma_n(w,w) \hat\Sigma_n^+(w,w)}} - \right|. - \end{align*} - % - Now from the proof of Lemma~\ref{lem:kernel_app_covariance_estimation} we - have that - $\sup_{w \in \cW} \left| - \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} - {\sqrt{\Sigma_n(w,w)}} \right| - \lesssim_\P \sqrt{\log n}$, - while Theorem~\ref{thm:kernel_bias} gives - $\sup_{w \in \cW} \big| \E\big[\hat f_W(w)\big] - f_W(w) \big| - \lesssim h^{p \wedge \beta}$. - By Lemma~\ref{lem:kernel_variance_bounds}, - note that - $\sup_{w \in \cW} \Sigma_n(w,w)^{-1/2} - \lesssim \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$, and - $\sup_{w \in \cW} \hat \Sigma_n^+(w,w)^{-1/2} - \lesssim_\P \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$ - by Lemma~\ref{lem:kernel_app_variance_estimator_bounds}. - Thus, applying Lemma~\ref{lem:kernel_app_sdp} to control the - covariance estimation error, - % - \begin{align*} - \sup_{w \in \cW} - \left| \hat T_n(w) - T_n(w) \right| - &\lesssim_\P - \left( - \sqrt{\log n} + \frac{h^{p \wedge \beta}}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} - \right) - \frac{\sqrt{\log n}}{n} - \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \\ - &\lesssim_\P - \sqrt{\frac{\log n}{n}} - \left( - \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} - {\Dl + 1/\sqrt{n h}} - \right) - \frac{1}{\Dl + 1/\sqrt{n h}}. 
- \end{align*} -\end{proof} - -\begin{proof}[% - Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}] - - Firstly, note that $\hat Z_n^T$ exists - by noting that $\hat \Sigma_n^+(w,w')$ and therefore also - $\frac{\hat \Sigma_n^+(w,w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$ - are positive semi-definite - functions and appealing to the - Kolmogorov consistency theorem \citep{gine2021mathematical}. - To obtain the desired Kolmogorov--Smirnov result we discretize and - use the Gaussian--Gaussian comparison result found in - Lemma~3.1 in \citet{chernozhukov2013gaussian}. - - \proofparagraph{bounding the covariance discrepancy} - - Define the maximum discrepancy in the (conditional) covariances - of $\hat Z_n^T$ and $Z_n^T$ by - % - \begin{align*} - \Delta - &\vcentcolon= - \sup_{w, w' \in \cW} - \left| - \frac{\hat \Sigma_n^+(w,w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - - \frac{\Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} - \right|. - \end{align*} - % - This variable can be bounded in probability - in the following manner. - First note that by the Cauchy--Schwarz inequality - for covariances, - $|\Sigma_n(w,w')| \leq - \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}$. - Hence - % - \begin{align*} - \Delta - &\leq - \sup_{w, w' \in \cW} - \left\{ - \left| - \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \right| - + \left| - \frac{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')} - - \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \right| - \right\} \\ - &\leq - \sup_{w, w' \in \cW} - \left\{ - \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} - {\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \left| - \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} - \right| - \right\} \\ - &\quad+ - \sup_{w, w' \in \cW} - \left| - \frac{\hat \Sigma_n^+(w,w)\hat \Sigma_n^+(w',w') - - \Sigma_n(w,w) \Sigma_n(w',w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')}} - \right|. - \end{align*} - % - For the first term, note that - $\inf_{w \in \cW} \hat \Sigma_n^+(w,w) - \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}$ - by Lemma~\ref{lem:kernel_app_variance_estimator_bounds} and also - $\sup_{w \in \cW} - \left|\frac{\hat \Sigma_n(w,w)}{\Sigma_n(w,w)} - 1\right| - \lesssim_\P \sqrt{h \log n}$ - by the proof of Lemma~\ref{lem:kernel_app_sdp}. - Thus by Lemma~\ref{lem:kernel_app_sdp}, - % - \begin{align*} - &\sup_{w, w' \in \cW} - \left\{ - \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} - {\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \left| - \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} - {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} - \right| - \right\} \\ - &\quad\lesssim_\P - \frac{\sqrt{\log n}}{n} - \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} - \lesssim_\P - \sqrt{\frac{\log n}{n}} - \frac{1}{\Dl + 1/\sqrt{n h}}. 
- \end{align*} - % - For the second term, we have by the same bounds - % - \begin{align*} - &\sup_{w, w' \in \cW} - \left| - \frac{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - - \Sigma_n(w,w) \Sigma_n(w',w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')}} - \right| \\ - &\quad\leq - \sup_{w, w' \in \cW} - \left\{ - \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big| - \hat \Sigma_n^+(w',w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')}} - \right\} \\ - &\qquad+ - \sup_{w, w' \in \cW} - \left\{ - \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big| - \Sigma_n(w,w)} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')}} - \right\} \\ - &\quad\leq - \sup_{w, w' \in \cW} - \left\{ - \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big|} - {\sqrt{\Sigma_n(w,w)}} - \frac{\sqrt{\hat \Sigma_n^+(w',w')}} - {\sqrt{\hat \Sigma_n^+(w,w) \Sigma_n(w',w')}} - \right\} \\ - &\qquad+ - \!\sup_{w, w' \in \cW}\! - \left\{ - \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big|} - {\sqrt{\Sigma_n(w',w')}} - \frac{\sqrt{\Sigma_n(w,w)}} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \right\} - \lesssim_\P - \sqrt{\frac{\log n}{n}} - \frac{1}{\Dl + 1/\sqrt{n h}}. - \end{align*} - % - Therefore - $\Delta \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}$. - - \proofparagraph{Gaussian comparison on a mesh} - - Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ - with cardinality $O(1/\delta_n)$, - where $1/\delta_n$ is at most polynomial in $n$. - The scaled (conditionally) Gaussian - processes $Z_n^T$ and $\hat Z_n^T$ - both have pointwise (conditional) variances of 1. - Therefore, by Lemma~3.1 in \citet{chernozhukov2013gaussian}, - % - \begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW_\delta} - Z_n^T(w) - \leq t - \right) - - \P\left( - \sup_{w \in \cW_\delta} - \hat Z_n^T(w) - \leq t - \Bigm\vert \bW_n - \right) - \right| - &\lesssim - \Delta^{1/3} - \Big( - 1 \vee \log \frac{1}{\Delta \delta_n} - \Big)^{2/3} - \end{align*} - % - uniformly in the data. By the previous part and - since $x (\log 1/x)^2$ is increasing on $\big(0, e^{-2}\big)$, - % - \begin{align*} - &\sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW_\delta} - Z_n^T(w) - \leq t - \right) - - \P\left( - \sup_{w \in \cW_\delta} - \hat Z_n^T(w) - \leq t - \Bigm\vert \bW_n - \right) - \right| \\ - &\quad\lesssim_\P - \left( - \sqrt{\frac{\log n}{n}} - \frac{1}{\Dl + 1/\sqrt{n h}} - \right)^{1/3} - (\log n)^{2/3} - \lesssim_\P - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}}. - \end{align*} - - \proofparagraph{trajectory regularity of $Z_n^T$} - - In the proof of Theorem~\ref{thm:kernel_infeasible_ucb} - we established that $Z_n^T$ satisfies the regularity property - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \big| Z_n^T(w) - Z_n^T(w') \big| - \right] - &\lesssim - n h^{-1} - \sqrt{\delta_n \log n}, - \end{align*} - % - whenever $1/\delta_n$ - is at most polynomial in $n$. - - \proofparagraph{conditional $L^2$ regularity of $\hat Z_n^T$} - - By Lemma~\ref{lem:kernel_app_sdp}, - with $n h \gtrsim \log n$, - we have - uniformly in $w,w'$, - % - \begin{align*} - \big| - \hat \Sigma_n^+(w,w') - - \hat \Sigma_n^+(w,w) - \big| - &\lesssim - n^{-1} h^{-3} |w-w'|. 
- \end{align*} - % - Taking - $\delta_n \leq n^{-2} h^2$, - Lemma~\ref{lem:kernel_app_variance_estimator_bounds} - gives - % - \begin{align*} - \inf_{|w-w'| \leq \delta_n} - \hat \Sigma_n^+(w,w') - \gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h} - - n^{-1} h^{-3} \delta_n - \gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h} - - \frac{1}{n^3h} - \gtrsim - \frac{\Dl^2}{n} - + \frac{1}{n^2h}. - \end{align*} - % - The conditional $L^2$ - regularity of $\hat Z_n^T$ is - % - \begin{align*} - \E\left[ - \big( - \hat Z_n^T(w) - \hat Z_n^T(w') - \big)^2 - \bigm\vert \bW_n - \right] - &= - 2 - 2 - \frac{\hat \Sigma_n^+(w,w')} - {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}. - \end{align*} - % - Applying the same elementary result as for $Z_n^T$ - in the proof of Theorem~\ref{thm:kernel_infeasible_ucb} yields - % - \begin{align*} - \E\left[ - \big( - \hat Z_n^T(w) - \hat Z_n^T(w') - \big)^2 - \bigm\vert \bW_n - \right] - &\lesssim_\P - n^2 h^{-2} |w-w'|. - \end{align*} - % - Thus the conditional semimetric - induced by $\hat Z_n^T$ on $\cW$ is - % - \begin{align*} - \hat\rho(w,w') - &\vcentcolon= - \E\left[ - \big( - \hat Z_n^T(w) - \hat Z_n^T(w') - \big)^2 - \bigm\vert \bW_n - \right]^{1/2} - \lesssim_\P - n h^{-1} \sqrt{|w-w'|}. - \end{align*} - - \proofparagraph{conditional trajectory regularity of $\hat Z_n^T$} - - As for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, - we apply Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - now conditionally, to obtain - % - \begin{align*} - \E\left[ - \sup_{|w-w'| \leq \delta_n} - \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| - \Bigm\vert \bW_n - \right] - &\lesssim_\P - n h^{-1} - \sqrt{\delta_n \log n}, - \end{align*} - % - whenever $1/\delta_n$ - is at most polynomial in $n$. - - \proofparagraph{uniform Gaussian comparison} - - Now we use the trajectory regularity properties to - extend the Gaussian--Gaussian comparison result from a finite mesh - to all of $\cW$. - Write the previously established - approximation rate as - % - \begin{align*} - r_n - &= - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}}. 
- \end{align*} - % - Take $\varepsilon_n > 0$ and observe that - uniformly in $t \in \R$, - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \leq t - \Bigm\vert \bW_n - \right) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW_\delta} - \big| \hat Z_n^T(w) \big| - \leq t + \varepsilon_n - \Bigm\vert \bW_n - \right) - + \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW_\delta} - \big| Z_n^T(w) \big| - \leq t + \varepsilon_n - \right) - + O_\P(r_n) - + \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t + 2\varepsilon_n - \right) - + O_\P(r_n) - + \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - Z_n^T(w) - - Z_n^T(w') - \right| - \geq \varepsilon_n - \right) \\ - &\qquad+ - \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t + 2\varepsilon_n - \right) - + O_\P(r_n) - + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - \right) - + \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq 2\varepsilon_n - \right) \\ - &\qquad+ - O_\P(r_n) - + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). - \end{align*} - % - The converse inequality is obtained - analogously as follows: - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \leq t - \Bigm\vert \bW_n - \right) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW_\delta} - \big| \hat Z_n^T(w) \big| - \leq t - \varepsilon_n - \Bigm\vert \bW_n - \right) - - \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW_\delta} - \big| Z_n^T(w) \big| - \leq t - \varepsilon_n - \right) - - O_\P(r_n) - - \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - 2\varepsilon_n - \right) - - O_\P(r_n) - - \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - Z_n^T(w) - - Z_n^T(w') - \right| - \geq \varepsilon_n - \right) \\ - &\qquad- - \P\left( - \sup_{|w-w'| \leq \delta_n} - \left| - \hat Z_n^T(w) - - \hat Z_n^T(w') - \right| - \geq \varepsilon_n - \Bigm\vert \bW_n - \right) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - 2\varepsilon_n - \right) - - O_\P(r_n) - - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - \right) - - \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq 2\varepsilon_n - \right) \\ - &\qquad- - O_\P(r_n) - - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). 
- \end{align*} - % - Combining these uniform upper and lower bounds gives - % - \begin{align*} - &\sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \leq t - \Bigm\vert \bW_n - \right) - - - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - \right) - \right| \\ - &\qquad\lesssim_\P - \sup_{t \in \R} - \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq 2\varepsilon_n - \right) - + r_n - + \varepsilon_n^{-1} n h^{-1/2} \delta_n^{1/2} \sqrt{\log n}. - \end{align*} - % - For the remaining term, apply anti-concentration - for $Z_n^T$ from the proof of Theorem~\ref{thm:kernel_infeasible_ucb}: - % - \begin{align*} - \sup_{t \in \R} - \P\left( - \left| - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - - t - \right| - \leq \varepsilon - \right) - &\lesssim - \varepsilon - \sqrt{\log n}. - \end{align*} - % - Therefore - % - \begin{align*} - &\sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \leq t - \Bigm\vert \bW_n - \right) - - - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - \right) - \right| \\ - &\qquad\lesssim_\P - \varepsilon_n \sqrt{\log n} - + r_n - + \varepsilon_n^{-1} n h^{-1/2} \delta_n^{1/2} \sqrt{\log n}. - \end{align*} - % - Taking $\varepsilon = r_n / \sqrt{\log n}$ - and then $\delta_n = n^{-2} h r_n^2 \varepsilon_n^2 / \log n$ - yields - % - \begin{align*} - \left| - \P\left( - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \leq t - \Bigm\vert \bW_n - \right) - - - \P\left( - \sup_{w \in \cW} - \big| Z_n^T(w) \big| - \leq t - \right) - \right| - &\lesssim_\P - r_n = - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}}. - \end{align*} -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}] - - \proofparagraph{Kolmogorov--Smirnov approximation} - - Let $Z_n^T$ and $\hat Z_n^T$ be defined - as in the proof of - Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. - Write - % - \begin{align*} - r_n - &= - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}} - \end{align*} - % - for the rate of approximation from - Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. 
- For any $\varepsilon_n > 0$ and uniformly in $t \in \R$: - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \left| - \hat Z_n^T(w) - \right| - \leq t - \Bigm\vert \bW_n - \right) - \leq - \P\left( - \sup_{w \in \cW} - \left| - Z_n^T(w) - \right| - \leq t - \right) - + - O_\P(r_n) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \left| - Z_n^T(w) - \right| - \leq t - \varepsilon_n - \right) - + - \P\left( - \left| - \sup_{w \in \cW} - \big| - Z_n^T(w) - \big| - -t - \right| - \leq \varepsilon_n - \right) - + - O_\P(r_n) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - + - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) - Z_n^T(w) \right| - \geq \varepsilon_n - \right) \\ - &\qquad+ - \P\left( - \left| - \sup_{w \in \cW} - \big| - Z_n^T(w) - \big| - -t - \right| - \leq \varepsilon_n - \right) - + - O_\P(r_n) \\ - &\quad\leq - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - + - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) - Z_n^T(w) \right| - \geq \varepsilon_n - \right) - + \varepsilon_n \sqrt{\log n} - + O_\P(r_n), - \end{align*} - % - where in the last line we used the anti-concentration result - from Lemma~\ref{lem:kernel_app_anticoncentration} - applied to $Z_n^T$, - as in the proof of - Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. - The corresponding lower bound is as follows: - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \left| - \hat Z_n^T(w) - \right| - \leq t - \Bigm\vert \bW_n - \right) - \geq - \P\left( - \sup_{w \in \cW} - \left| - Z_n^T(w) - \right| - \leq t - \right) - - - O_\P(r_n) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \left| - Z_n^T(w) - \right| - \leq t + \varepsilon_n - \right) - - - \P\left( - \left| - \sup_{w \in \cW} - \big| - Z_n^T(w) - \big| - -t - \right| - \leq \varepsilon_n - \right) - - - O_\P(r_n) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - - - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) - Z_n^T(w) \right| - \geq \varepsilon_n - \right) \\ - &\qquad- - \P\left( - \left| - \sup_{w \in \cW} - \big| - Z_n^T(w) - \big| - -t - \right| - \leq \varepsilon_n - \right) - - - O_\P(r_n) \\ - &\quad\geq - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - - - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) - Z_n^T(w) \right| - \geq \varepsilon_n - \right) - - \varepsilon_n \sqrt{\log n} - - O_\P(r_n). - \end{align*} - - \proofparagraph{$t$-statistic approximation} - - To control the remaining term, - note that by - Theorem~\ref{thm:kernel_strong_approx_Tn} - and Lemma~\ref{lem:kernel_app_studentized_t_statistic}, - % - \begin{align*} - &\sup_{w \in \cW} - \left| \hat T_n(w) - Z_n^T(w) \right| \\ - &\quad\leq - \sup_{w \in \cW} - \left| \hat T_n(w) - T_n(w) \right| - + \sup_{w \in \cW} - \left| T_n(w) - Z_n^T(w) \right| \\ - &\quad\lesssim_\P - \sqrt{\frac{\log n}{n}} - \left( - \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} - {\Dl + 1/\sqrt{n h}} - \right) - \frac{1}{\Dl + 1/\sqrt{n h}} \\ - &\qquad+ - \frac{ - n^{-1/2} \log n - + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n - + n^{-2/3} h^{-1/2} (\log n)^{2/3} - + n^{1/2} h^{p \wedge \beta}} - {\Dl + 1/\sqrt{n h}} - \end{align*} - % - and denote this last quantity by $r_n'$. 
- Then for any $\varepsilon_n \gg r_n'$, - we have - % - \begin{align*} - \sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right) - - \P\left( - \sup_{w \in \cW} - \left| - \hat Z_n^T(w) - \right| - \leq t - \Bigm\vert \bW_n - \right) - \right| - &\lesssim_\P - \varepsilon_n \sqrt{\log n} - + r_n - + o(1). - \end{align*} - - \proofparagraph{rate analysis} - - This rate is $o_\P(1)$ - with an appropriate choice of $\varepsilon_n$ whenever - $r_n \to 0$ and $r_n' \sqrt{\log n} \to 0$, - by Lemma~\ref{lem:kernel_app_slow_convergence}, along with - a slowly diverging sequence $R_n$. Explicitly, we require the following. - % - \begin{align*} - \frac{n^{-1/2} (\log n)^{3/2}}{\Dl + 1/\sqrt{n h}} - &\to 0, - &\frac{h^{p \wedge \beta} \log n}{\Dl^2 + (n h)^{-1}} - &\to 0, \\ - \frac{n^{-1/2} (\log n)^{3/2}} - {\Dl + 1/\sqrt{n h}} - &\to 0, - &\frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} - {\Dl + 1/\sqrt{n h}} - &\to 0, \\ - \frac{n^{-2/3} h^{-1/2} (\log n)^{7/6}} - {\Dl + 1/\sqrt{n h}} - &\to 0, - &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} - {\Dl + 1/\sqrt{n h}} - &\to 0, \\ - \frac{n^{-1/6}(\log n)^{5/6}} - {\Dl^{1/3} + (n h)^{-1/6}} - &\to 0. - \end{align*} - % - Using the fact that $h \lesssim n^{-\varepsilon}$ - for some $\varepsilon > 0$ - and removing trivial statements leaves us with - % - \begin{align*} - \frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} - {\Dl + 1/\sqrt{n h}} - &\to 0, - &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} - {\Dl + 1/\sqrt{n h}} - &\to 0. - \end{align*} - % - We analyze these based on the degeneracy - and verify that they hold under Assumption~\ref{ass:kernel_rates}. - % - \begin{enumerate}[label=(\roman*)] - - \item No degeneracy: - if $\Dl > 0$ then we need - % - \begin{align*} - n^{-3/4} h^{-7/8} (\log n)^{7/8} - &\to 0, - &n^{1/2} h^{p \wedge \beta} (\log n)^{1/2} - &\to 0. - \end{align*} - % - These reduce to - $n^{-6/7} \log n \ll h - \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$. - - \item Partial or total degeneracy: - if $\Dl = 0$ then we need - % - \begin{align*} - n^{-1/4} h^{-3/8} (\log n)^{7/8} - &\to 0, - &n h^{(p \wedge \beta) + 1/2} (\log n)^{1/2} - &\to 0. - \end{align*} - % - These reduce to - $n^{-2/3} (\log n)^{7/3} \ll h - \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. - % - \end{enumerate} - -\end{proof} - -\begin{proof}[Theorem~\ref{thm:kernel_ucb}] - - \proofparagraph{existence of the conditional quantile} - - We argue as in the proof of - Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}, - now also conditioning on the data. - In particular, using the anti-concentration result from - Lemma~\ref{lem:kernel_app_anticoncentration}, - the regularity property of $\hat Z_n^T$, - and the Gaussian process maximal inequality from - Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, - we see that for any $\varepsilon > 0$, - % - \begin{align*} - \sup_{t \in \R} - \P\left( - \left| - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - - t - \right| - \leq 2\varepsilon - \Bigm\vert \bW_n - \right) - &\leq - 8 \varepsilon - \left( - 1 + \E\left[ - \sup_{w \in \cW} - \big| \hat Z_n^T(w) \big| - \Bigm\vert \bW_n - \right] - \right) - \lesssim \varepsilon \sqrt{\log n}. - \end{align*} - % - Thus letting $\varepsilon \to 0$ - shows that the conditional distribution function of - $\sup_{w \in \cW} \big|\hat Z_n^T(w)\big|$ - is continuous, - and therefore all of its conditional quantiles exist. 
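- Indeed, as a brief illustrative sketch of this limiting argument
- (with $\Delta$ and $t_0$ hypothetical quantities introduced only for
- illustration): if the conditional law of
- $\sup_{w \in \cW} \big| \hat Z_n^T(w) \big|$
- placed mass $\Delta > 0$ at some point $t_0 \in \R$,
- then for every $\varepsilon > 0$ we would have
- %
- % illustrative display: $\Delta$ and $t_0$ are hypothetical
- \begin{align*}
- \P\left(
- \left|
- \sup_{w \in \cW}
- \big| \hat Z_n^T(w) \big|
- - t_0
- \right|
- \leq 2\varepsilon
- \Bigm\vert \bW_n
- \right)
- \geq \Delta,
- \end{align*}
- %
- contradicting the displayed bound above once $\varepsilon$ is taken
- sufficiently small.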
- - \proofparagraph{validity of the confidence band} - - Define the following (conditional) distribution functions. - % - \begin{align*} - F_Z(t \mid \bW_n) - &= - \P\left( - \sup_{w \in \cW} - \left| \hat Z_n^T(w) \right| - \leq t - \Bigm\vert \bW_n - \right), - &F_T(t) - &= - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq t - \right), - \end{align*} - % - along with their well-defined right-quantile functions, - % - \begin{align*} - F_Z^{-1}(p \mid \bW_n) - &= - \sup - \big\{ - t \in \R - \, : \, - F_Z(t \mid \bW_n) - = p - \big\}, - &F_T^{-1}(p) - &= - \sup - \big\{ - t \in \R - \, : \, - F_T(t) - = p - \big\}. - \end{align*} - % - Note that - $t \leq F_Z^{-1}(p \mid \bW_n)$ - if and only if - $F_Z(t \mid \bW_n) \leq p$. - Take $\alpha \in (0,1)$ and - define the quantile - $\hat q_{1-\alpha} = F_Z^{-1}(1-\alpha \mid \bW_n)$, - so that - $F_Z(\hat q_{1-\alpha} \mid \bW_n) = 1-\alpha$. - By Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}, - % - \begin{align*} - \sup_{t \in \R} - \big| - F_Z(t \mid \bW_n) - F_T(t) - \big| - &= - o_\P(1). - \end{align*} - % - Thus by Lemma~\ref{lem:kernel_app_slow_convergence}, - this can be replaced by - % - \begin{align*} - \P\left( - \sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| - > \varepsilon_n - \right) - &\leq \varepsilon_n - \end{align*} - % - for some $\varepsilon_n \to 0$. - Therefore - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \left| - \hat T_n(w) - \right| - \leq - \hat q_{1-\alpha} - \right) - &= - \P\left( - \sup_{w \in \cW} - \left| - \hat T_n(w) - \right| - \leq - F_Z^{-1}(1-\alpha \mid \bW_n) - \right) \\ - &= - \P\left( - F_Z\left( - \sup_{w \in \cW} - \left| - \hat T_n(w) - \right| - \Bigm\vert \bW_n - \right) - \leq - 1 - \alpha - \right) \\ - &\leq - \P\left( - F_T\left( - \sup_{w \in \cW} - \left| - \hat T_n(w) - \right| - \right) - \leq - 1 - \alpha + \varepsilon_n - \right) - + \varepsilon_n - \leq 1 - \alpha + 3\varepsilon_n, - \end{align*} - % - where we used the fact that for any - real-valued random variable $X$ with distribution function $F$, - we have - $\big|\P\big(F(X) \leq t\big) - t\big| \leq \Delta$, - where $\Delta$ is the size of the - largest jump discontinuity in $F$. - By uniform integrability, - $\sup_{t \in \R} \big| F_Z(t) - F_T(t) \big| = o(\varepsilon_n)$. - Since $F_Z$ has no jumps, - we must have $\Delta \leq \varepsilon_n$ for $F_T$. - Finally, a lower bound is constructed in an analogous manner, - giving - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \left| \hat T_n(w) \right| - \leq - \hat q_{1-\alpha} - \right) - &\geq - 1 - \alpha - 3\varepsilon_n. - \end{align*} - % -\end{proof} - -\begin{proof}[Lemma~\ref{lem:kernel_app_counterfactual_bias}] - - Writing - $k_{i j} = k_h(W_{i j}^1, w)$, - $\psi_i = \psi(X_i^1)$, - $\hat\psi_i = \hat\psi(X_i^1)$, - and $\kappa_{i j} = \kappa(X_i^0, X_i^1, X_j^1)$, - % - \begin{align*} - \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - &= - \E\left[ - \frac{2}{n(n-1)} - \sum_{i 0$ and - $p \in [1, \infty]$, with $\E^*$ the outer expectation, if - % - \begin{align*} - \E^* \left[ - \sup_{A \in \cB(\R^d)} - \Big\{ - \P \big( X \in A \mid \cH' \big) - - F \big( A_p^\eta \mid \cH' \big) - \Big\} - \right] - \leq \rho, - \end{align*} - % - where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$ - and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$, - then there exists an $\R^d$-valued random variable $Y$ - with $Y \mid \cH' \sim F(\cdot \mid \cH')$ - and $\P \left( \|X-Y\|_p > \eta \right) \leq \rho$. 
- % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_strassen}] - By Theorem~B.2 in \citet{chen2020jackknife}, noting that the $\sigma$-algebra - generated by $Z$ is countably generated and using the metric induced by the - $\ell^p$-norm. -\end{proof} - -Next, we present in Lemma~\ref{lem:yurinskii_app_smooth_approximation} an -analytic result -concerning the smooth approximation of Borel set indicator functions, similar -to that given in \citet[Lemma~39]{belloni2019conditional}. - -\begin{lemma}[Smooth approximation of Borel indicator functions]% - \label{lem:yurinskii_app_smooth_approximation} - Let $A \subseteq \R^d$ be a Borel set and $Z \sim \cN(0, I_d)$. - For $\sigma, \eta > 0$ and $p \in [1, \infty]$, define - % - \begin{align*} - g_{A\eta}(x) - &= - \left( 1 - \frac{\|x-A^\eta\|_p}{\eta} \right) \vee 0 - & &\text{and} - &f_{A\eta\sigma}(x) - &= - \E\big[g_{A\eta}(x + \sigma Z) \big]. - \end{align*} - % - Then $f$ is infinitely differentiable - and with $\varepsilon = \P(\|Z\|_p > \eta / \sigma)$, - for all $k \geq 0$, - any multi-index $\kappa = (\kappa_1,\dots, \kappa_d)\in\N^d$, - and all $x,y \in \R^d$, - we have $|\partial^\kappa f_{A\eta\sigma}(x)| \leq - \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}$ and - % - \begin{align*} - &\Bigg| - f_{A\eta\sigma}(x+y) - \sum_{|\kappa| = 0}^k - \frac{1}{\kappa!} - \partial^\kappa f_{A\eta\sigma}(x) - y^\kappa - \Bigg| - \leq - \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}, \\ - &(1 - \varepsilon) \I\big\{x \in A\big\} - \leq f_{A\eta\sigma}(x) - \leq \varepsilon + (1 - \varepsilon) - \I\big\{x \in A^{3\eta}\big\}. - \end{align*} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_smooth_approximation}] - Drop subscripts on $g_{A\eta}$ and $f_{A \eta \sigma}$. - By Taylor's theorem with Lagrange remainder, for $t \in [0,1]$, - % - \begin{align*} - \Bigg| - f(x + y) - - \sum_{|\kappa|=0}^{k} - \frac{1}{\kappa!} - \partial^{\kappa} f(x) - y^\kappa - \Bigg| - \leq - \Bigg| - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \big( - \partial^{\kappa} f(x + t y) - - \partial^{\kappa} f(x) - \big) - \Bigg|. - \end{align*} - % - Now with $\phi(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$, - % - \begin{align*} - f(x) - &= - \E\big[g(x + \sigma W) \big] - = - \int_{\R^d} - g(x + \sigma u) - \prod_{j=1}^{d} - \phi(u_j) - \diff u - = - \frac{1}{\sigma^d} - \int_{\R^d} - g(u) - \prod_{j=1}^{d} - \phi \left( \frac{u_j-x_j}{\sigma} \right) - \diff u - \end{align*} - % - and since the integrand is bounded, we exchange differentiation and - integration to compute - % - \begin{align} - \nonumber - \partial^\kappa - f(x) - &= - \frac{1}{\sigma^{d+|\kappa|}} - \int_{\R^d} - g(u) - \prod_{j=1}^{d} - \partial^{\kappa_j} - \phi \left( \frac{u_j-x_j}{\sigma} \right) - \diff u - = \left( \frac{-1}{\sigma} \right)^{|\kappa|} - \int_{\R^d} - g(x + \sigma u) - \prod_{j=1}^{d} - \partial^{\kappa_j} - \phi(u_j) - \diff u \\ - \label{eq:yurinskii_app_smoothing_derivative} - &= - \left( \frac{-1}{\sigma} \right)^{|\kappa|} - \E \Bigg[ - g(x + \sigma Z) - \prod_{j=1}^{d} - \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} - \Bigg], - \end{align} - % - where $Z \sim \cN(0, I_d)$. 
- Recalling that $|g(x)| \leq 1$ and applying the Cauchy--Schwarz inequality, - % - \begin{align*} - \left| - \partial^\kappa - f(x) - \right| - &\leq - \frac{1}{\sigma^{|\kappa|}} - \prod_{j=1}^{d} - \E \left[ - \left( - \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} - \right)^2 - \right]^{1/2} - \leq - \frac{1}{\sigma^{|\kappa|}} - \prod_{j=1}^{d} - \sqrt{\kappa_j!} - = - \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}, - \end{align*} - % - as the expected square of the Hermite polynomial of degree - $\kappa_j$ against the standard Gaussian measure is $\kappa_j!$. By the - reverse triangle inequality, $|g(x + t y) - g(x)| \leq t \|y\|_p / \eta$, - so by \eqref{eq:yurinskii_app_smoothing_derivative}, - % - \begin{align*} - &\left| - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \big( - \partial^{\kappa} f(x + t y) - - \partial^{\kappa} f(x) - \big) - \right| \\ - &\quad= - \left| - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \frac{1}{\sigma^{|\kappa|}} - \E \Bigg[ - \big( - g(x + t y + \sigma Z) - - g(x + \sigma Z) - \big) - \prod_{j=1}^{d} - \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} - \Bigg] - \right| \\ - &\quad\leq - \frac{t \|y\|_p}{\sigma^k \eta} - \, \E \left[ - \Bigg| - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \prod_{j=1}^{d} - \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} - \Bigg| - \right]. - \end{align*} - % - Therefore, by the Cauchy--Schwarz inequality, - % - \begin{align*} - &\Bigg( - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \big( - \partial^{\kappa} f(x + t y) - - \partial^{\kappa} f(x) - \big) - \Bigg)^2 - \leq - \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} - \, \E \left[ - \Bigg( - \sum_{|\kappa|=k} - \frac{y^\kappa}{\kappa!} - \prod_{j=1}^{d} - \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} - \Bigg)^2 - \right] \\ - &\quad= - \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} - \sum_{|\kappa|=k} - \sum_{|\kappa'|=k} - \frac{y^{\kappa + \kappa'}}{\kappa! \kappa'!} - \prod_{j=1}^{d} - \, \E \left[ - \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} - \frac{\partial^{\kappa'_j} \phi(Z_j)}{\phi(Z_j)} - \right]. - \end{align*} - % - Orthogonality of Hermite polynomials gives zero if - $\kappa_j \neq \kappa'_j$. By the multinomial theorem, - % - \begin{align*} - \left| - f(x + y) - - \sum_{|\kappa|=0}^{k} - \frac{1}{\kappa!} - \partial^{\kappa} f(x) - y^\kappa - \right| - &\leq - \frac{\|y\|_p}{\sigma^k \eta} - \Bigg( - \sum_{|\kappa|=k} - \frac{y^{2 \kappa}}{\kappa!} - \Bigg)^{1/2} - \leq - \frac{\|y\|_p}{\sigma^k \eta \sqrt{k!}} - \Bigg( - \sum_{|\kappa|=k} - \frac{k!}{\kappa!} - y^{2 \kappa} - \Bigg)^{1/2} \\ - &\leq - \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}. - \end{align*} - % - For the final result, since - $f(x) = \E \left[ g(x + \sigma Z) \right]$ and - $\I\big\{x \in A^\eta\big\}\leq g(x)\leq \I\big\{x \in A^{2\eta}\big\}$, - % - \begin{align*} - f(x) - &\leq - \P \left( x + \sigma Z \in A^{2 \eta} \right) \\ - &\leq - \P \left( \|Z\|_p > \frac{\eta}{\sigma} \right) - + \I \left\{ x \in A^{3 \eta} \right\} - \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) - = \varepsilon - + (1 - \varepsilon) \I \left\{ x \in A^{3 \eta} \right\}, \\ - f(x) - &\geq - \P \left( x + \sigma Z \in A^{\eta} \right) - \geq - \I \left\{ x \in A \right\} - \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) - = (1 - \varepsilon) \I \left\{ x \in A \right\}. 
- \end{align*} - % -\end{proof} - -We provide a useful Gaussian inequality in -Lemma~\ref{lem:yurinskii_app_gaussian_useful} -which helps bound the $\beta_{\infty,k}$ moment terms appearing in several -places throughout the analysis. - -\begin{lemma}[A useful Gaussian inequality]% - \label{lem:yurinskii_app_gaussian_useful} - - Let $X \sim \cN(0, \Sigma)$ - where $\sigma_j^2 = \Sigma_{j j} \leq \sigma^2$ for all $1 \leq j \leq d$. - Then - % - \begin{align*} - \E\left[ - \|X\|_2^2 - \|X\|_\infty - \right] - &\leq - 4 \sigma \sqrt{\log 2d} - \,\sum_{j=1}^d \sigma_j^2 - &&\text{and} - &\E\left[ - \|X\|_2^3 - \|X\|_\infty - \right] - &\leq - 8 \sigma \sqrt{\log 2d} - \,\bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^{3/2}. - \end{align*} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_useful}] - - By Cauchy--Schwarz, with $k \in \{2,3\}$, we have - $\E\left[\|X\|_2^{k} \|X\|_\infty \right] - \leq \E\big[\|X\|_2^{2k} \big]^{1/2} \E\big[\|X\|_\infty^2 \big]^{1/2}$. - For the first term, by H{\"o}lder's inequality and the even - moments of the normal distribution, - % - \begin{align*} - \E\big[\|X\|_2^4 \big] - &= - \E\Bigg[ - \bigg( - \sum_{j=1}^d X_j^2 - \bigg)^2 - \Bigg] - = - \sum_{j=1}^d \sum_{k=1}^d - \E\big[ - X_j^2 X_k^2 - \big] - \leq - \bigg( - \sum_{j=1}^d - \E\big[X_j^4 \big]^{\frac{1}{2}} - \bigg)^2 - = - 3 \bigg( - \sum_{j=1}^d - \sigma_j^2 - \bigg)^2, \\ - \E\big[\|X\|_2^6 \big] - &= - \sum_{j=1}^d \sum_{k=1}^d \sum_{l=1}^d - \E\big[ - X_j^2 X_k^2 X_l^2 - \big] - \leq - \bigg( - \sum_{j=1}^d - \E\big[X_j^6 \big]^{\frac{1}{3}} - \bigg)^3 - = - 15 \bigg( - \sum_{j=1}^d - \sigma_j^2 - \bigg)^3. - \end{align*} - % - For the second term, by Jensen's inequality and the $\chi^2$ moment - generating function, - % - \begin{align*} - \E\big[\|X\|_\infty^2 \big] - &= - \E\left[ - \max_{1 \leq j \leq d} - X_j^2 - \right] - \leq - 4 \sigma^2 - \log - \sum_{j=1}^d - \E\Big[ - e^{X_j^2 / (4\sigma^2)} - \Big] - \leq - 4 \sigma^2 - \log - \sum_{j=1}^d - \sqrt{2} - \leq - 4 \sigma^2 - \log 2 d. - \end{align*} - % -\end{proof} - -We provide an $\ell^p$-norm tail probability bound for Gaussian variables in -Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, motivating the definition of the -term -$\phi_p(d)$. - -\begin{lemma}[Gaussian \texorpdfstring{$\ell^p$}{lp}-norm bound]% - \label{lem:yurinskii_app_gaussian_pnorm} - Let $X \sim \cN(0, \Sigma)$ where $\Sigma \in \R^{d \times d}$ - is a positive semi-definite matrix. Then we have that - $\E\left[ \|X\|_p \right] \leq - \phi_p(d) \max_{1 \leq j \leq d} \sqrt{\Sigma_{j j}}$ - with $\phi_p(d) = \sqrt{pd^{2/p} }$ for $p \in [1,\infty)$ - and $\phi_\infty(d) = \sqrt{2\log 2d}$. -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}] - - For $p \in [1, \infty)$, - as each $X_j$ is Gaussian, we have - $\big(\E\big[|X_j|^p\big]\big)^{1/p} - \leq \sqrt{p\, \E[X_j^2]} - = \sqrt{p \Sigma_{j j}}$. - So - % - \begin{align*} - \E\big[\|X\|_p\big] - &\leq - \Bigg(\sum_{j=1}^d \E \big[ |X_j|^p \big] \Bigg)^{1/p} - \leq \Bigg(\sum_{j=1}^d p^{p/2} \Sigma_{j j}^{p/2} \Bigg)^{1/p} - \leq \sqrt{p d^{2/p}} - \max_{1\leq j\leq d} - \sqrt{\Sigma_{j j}} - \end{align*} - % - by Jensen's inequality. 
- For $p=\infty$, - with $\sigma^2 = \max_j \Sigma_{j j}$, - for $t>0$, - % - \begin{align*} - \E\big[\|X\|_\infty \big] - &\leq - t - \log - \sum_{j=1}^d - \E\Big[ - e^{|X_j| / t} - \Big] - \leq - t - \log - \sum_{j=1}^d - \E\Big[ - 2 e^{X_j / t} - \Big] - \leq t \log \Big(2 d e^{\sigma^2/(2t^2)}\Big) - \leq t \log 2 d + \frac{\sigma^2}{2t}, - \end{align*} - % - again by Jensen's inequality. - Setting $t = \frac{\sigma}{\sqrt{2 \log 2d}}$ gives - $\E\big[\|X\|_\infty \big] \leq \sigma \sqrt{2 \log 2d}$. - % -\end{proof} - -We give a Gaussian--Gaussian $\ell^p$-norm approximation -as Lemma~\ref{lem:yurinskii_app_feasible_gaussian}, useful for -ensuring approximations remain valid upon substituting -an estimator for the true variance matrix. - -\begin{lemma}[Gaussian--Gaussian approximation in - \texorpdfstring{$\ell^p$}{lp}-norm]% - \label{lem:yurinskii_app_feasible_gaussian} - - Let $\Sigma_1, \Sigma_2 \in \R^{d \times d}$ be positive semi-definite - and take $Z \sim \cN(0, I_d)$. - For $p \in [1, \infty]$ we have - % - \begin{align*} - \P\left( - \left\| - \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z - \right\|_p - > t - \right) - &\leq - 2 d \exp \left( - \frac{-t^2} - {2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} - \right). - \end{align*} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_feasible_gaussian}] - - Let $\Sigma \in \R^{d \times d}$ be positive semi-definite - and write $\sigma^2_j = \Sigma_{j j} $. - For $p \in [1, \infty)$ by a union bound and - Gaussian tail probabilities, - % - \begin{align*} - \P\left(\big\| \Sigma^{1/2} Z \big\|_p > t \right) - &= - \P\Bigg( - \sum_{j=1}^d - \left| - \left( - \Sigma^{1/2} Z - \right)_j - \right|^p - > t^p \Bigg) - \leq - \sum_{j=1}^d - \P\Bigg( - \left| - \left( - \Sigma^{1/2} Z - \right)_j - \right|^p - > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} - \Bigg) \\ - &= - \sum_{j=1}^d - \P\Bigg( - \left| - \sigma_j Z_j - \right|^p - > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} - \Bigg) - = - \sum_{j=1}^d - \P\left( - \left| Z_j \right| - > \frac{t}{\|\sigma\|_p} - \right) - \leq - 2 d \, \exp\left( \frac{-t^2}{2 \|\sigma\|_p^2} \right). - \end{align*} - % - The same result holds for $p = \infty$ since - % - \begin{align*} - \P\left(\big\| \Sigma^{1/2} Z \big\|_\infty > t \right) - &= - \P\left( - \max_{1 \leq j \leq d} - \left| - \left( - \Sigma^{1/2} Z - \right)_j - \right| - > t \right) - \leq - \sum_{j=1}^d - \P\left( - \left| - \left( - \Sigma^{1/2} Z - \right)_j - \right| - > t - \right) \\ - &= - \sum_{j=1}^d - \P\left( - \left| - \sigma_j Z_j - \right| - > t - \right) - \leq - 2 \sum_{j=1}^d - \exp\left( \frac{-t^2}{2 \sigma_j^2} \right) - \leq - 2 d - \exp\left( \frac{-t^2}{2 \|\sigma\|_\infty^2} \right). - \end{align*} - % - Now we apply this to the matrix - $\Sigma = \big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. 
- For $p \in [1, \infty)$, - % - \begin{align*} - \|\sigma\|_p^p - &= - \sum_{j=1}^d (\Sigma_{j j})^{p/2} - = - \sum_{j=1}^d - \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} - \leq - d \max_{1 \leq j \leq d} - \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} \\ - &\leq - d \, \Big\|\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big\|_2^{p/2} - = - d \, \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^p - \end{align*} - % - Similarly, for $p = \infty$ we have - % - \begin{align*} - \|\sigma\|_\infty - &= - \max_{1 \leq j \leq d} - (\Sigma_{j j})^{1/2} - = - \max_{1 \leq j \leq d} - \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{1/2} - \leq - \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2. - \end{align*} - % - Thus for all $p \in [1, \infty]$ we have - $\|\sigma\|_p \leq - d^{1/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2$, - with $d^{1/\infty} = 1$. Hence - % - \begin{align*} - \P\left( - \left\| - \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z - \right\|_p - > t - \right) - &\leq - 2 d \exp \left( \frac{-t^2}{2 \|\sigma\|_p^2} \right) - \leq - 2 d \exp \left( - \frac{-t^2} - {2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} - \right). - \end{align*} - % -\end{proof} - -We give a variance bound and an exponential inequality for $\alpha$-mixing -variables. - -\begin{lemma}[Variance bounds for - \texorpdfstring{$\alpha$}{alpha}-mixing random variables] - \label{lem:yurinskii_app_variance_mixing} - - Let $X_1, \ldots, X_n$ be - real-valued $\alpha$-mixing random - variables with mixing coefficients $\alpha(j)$. - Then - % - \begin{enumerate}[label=(\roman*)] - - \item - \label{it:yurinskii_app_variance_mixing_bounded} - If for constants $M_i$ we have - $|X_i| \leq M_i$ a.s.\ then - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - &\leq - 4 \sum_{j=1}^\infty \alpha(j) - \sum_{i=1}^n M_i^2. - \end{align*} - - \item - \label{it:yurinskii_app_variance_mixing_exponential} - If $\alpha(j) \leq e^{-2j / C_\alpha}$ then - for any $r>2$ there is a constant - $C_r$ depending only on $r$ with - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - &\leq - C_r C_\alpha - \sum_{i=1}^n - \E\big[|X_i|^r\big]^{2/r}. - \end{align*} - \end{enumerate} - % -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_variance_mixing}] - - Define - $\alpha^{-1}(t) = - \inf\{j \in \N : \alpha(j) \leq t\}$ - and $Q_i(t) = \inf\{s \in \R : \P(|X_i| > s) \leq t\}$. - By Corollary~1.1 in \citet{rio2017asymptotic} - and H{\"o}lder's inequality for $r > 2$, - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - &\leq - 4 \sum_{i=1}^n - \int_0^1 \alpha^{-1}(t) - Q_i(t)^2 \diff{t} \\ - &\leq - 4 \sum_{i=1}^n - \left( - \int_0^1 \alpha^{-1}(t)^{\frac{r}{r-2}} \diff{t} - \right)^{\frac{r-2}{r}} - \left( - \int_0^1 |Q_i(t)|^r \diff{t} - \right)^{\frac{2}{r}} - \diff{t}. - \end{align*} - % - Now note that if $U \sim \Unif[0,1]$ then - $Q_i(U)$ has the same distribution as $X_i$. - Therefore - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - &\leq - 4 - \left( - \int_0^1 \alpha^{-1}(t)^{\frac r{r-2}} \diff{t} - \right)^{\frac{r-2}r} - \sum_{i=1}^n - \E[|X_i|^r]^{\frac 2 r}. 
- \end{align*} - % - If $\alpha(j) \leq e^{-2j/C_\alpha}$ then - $\alpha^{-1}(t) \leq \frac{-C_\alpha \log t}{2}$ - so, for some constant - $C_r$ depending only on $r$, - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - \leq - 2 C_\alpha - \left( - \int_0^1 (-\log t)^{\frac r{r-2}} \diff{t} - \right)^{\frac{r-2} r} - \sum_{i=1}^n - \E[|X_i|^r]^{\frac 2 r} - \leq - C_r C_\alpha - \sum_{i=1}^n - \E[|X_i|^r]^{\frac 2 r}. - \end{align*} - % - Alternatively, if for constants $M_i$ we have - $|X_i| \leq M_i$ a.s.\ then - % - \begin{align*} - \Var\left[ - \sum_{i=1}^n X_i - \right] - &\leq - 4 \int_0^1 \alpha^{-1}(t) - \diff{t} - \sum_{i=1}^n M_i^2 - \leq - 4 \sum_{j=1}^\infty \alpha(j) - \sum_{i=1}^n M_i^2. - \end{align*} - % -\end{proof} - -\begin{lemma}[Exponential concentration inequalities for - \texorpdfstring{$\alpha$}{alpha}-mixing random variables] - \label{lem:yurinskii_app_exponential_mixing} - - Let $X_1, \ldots, X_n$ be zero-mean real-valued - variables with $\alpha$-mixing coefficients - $\alpha(j) \leq e^{-2 j / C_\alpha}$. - - \begin{enumerate}[label=(\roman*)] - - \item - \label{it:yurinskii_app_exponential_mixing_bounded} - Suppose $|X_i| \leq M$ a.s.\ for $1 \leq i \leq n$. - Then for all $t > 0$ there is a constant $C_1$ with - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n - X_i - \right| - > C_1 M \big( \sqrt{n t} - + (\log n)(\log \log n) t \big) - \right) - &\leq - C_1 e^{-t}. - \end{align*} - % - \item - \label{it:yurinskii_app_exponential_mixing_bernstein} - If further $\sum_{j=1}^n |\Cov[X_i, X_j]| \leq \sigma^2$, - then for all $t > 0$ there is a constant $C_2$ with - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n - X_i - \right| - \geq C_2 \big( (\sigma \sqrt n + M) \sqrt t - + M (\log n)^2 t \big) - \right) - &\leq - C_2 e^{-t}. - \end{align*} - - \end{enumerate} - -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_exponential_mixing}] - - \begin{enumerate}[label=(\roman*)] - - \item - By Theorem~1 in \citet{merlevede2009bernstein}, - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n - X_i - \right| - > t - \right) - &\leq - \exp\left( - -\frac{C_1 t^2}{n M^2 + Mt (\log n)(\log\log n)} - \right). - \end{align*} - % - Replace $t$ by - $M \sqrt{n t} + M (\log n)(\log \log n) t$. - - \item - By Theorem~2 in \citet{merlevede2009bernstein}, - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n - X_i - \right| - > t - \right) - &\leq - \exp\left( - -\frac{C_2 t^2}{n\sigma^2 + M^2 + Mt (\log n)^2} - \right). - \end{align*} - % - Replace $t$ by - $\sigma \sqrt n \sqrt t + M \sqrt t + M (\log n)^2 t$. - \end{enumerate} - % -\end{proof} - -\subsection{Main results} - -To establish Theorem~\ref{thm:yurinskii_sa_dependent}, we first -give the analogous result -for martingales as Lemma~\ref{lem:yurinskii_app_sa_martingale}. Our approach is -similar to -that used in modern versions of Yurinskii's coupling for independent data, as -in Theorem~1 in \citet{lecam1988} and Theorem~10 in Chapter~10 of -\citet{pollard2002user}. The proof of -Lemma~\ref{lem:yurinskii_app_sa_martingale} relies on -constructing a ``modified'' martingale, which is close to the original -martingale, but which has an $\cH_0$-measurable terminal quadratic variation. - -\begin{lemma}[Strong approximation for vector-valued martingales]% - \label{lem:yurinskii_app_sa_martingale} - - Let $X_1, \ldots, X_n$ be $\R^d$-valued - square-integrable random vectors - adapted to a countably generated - filtration $\cH_0, \ldots, \cH_n$. 
- Suppose that - $\E[X_i \mid \cH_{i-1}] = 0$ for all $1 \leq i \leq n$ - and define $S = \sum_{i=1}^n X_i$. - Let $V_i = \Var[X_i \mid \cH_{i-1}]$ and - $\Omega = \sum_{i=1}^n V_i - \Sigma$ - where $\Sigma$ is a positive semi-definite - $\cH_0$-measurable $d \times d$ random matrix. - For each $\eta > 0$ and $p \in [1,\infty]$ - there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with - % - \begin{align*} - \P\big(\|S-T\|_p > 5\eta\big) - &\leq - \inf_{t>0} - \left\{ - 2 \P\big( \|Z\|_p > t \big) - + \min\left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\} \\ - \nonumber - &\quad+ - \inf_{M \succeq 0} - \big\{ 2\gamma(M) + \delta_p(M,\eta) - + \varepsilon_p(M, \eta)\big\}, - \end{align*} - % - where the second infimum is over all positive semi-definite - $d \times d$ non-random matrices, and - % - \begin{align*} - \beta_{p,k} - &= - \sum_{i=1}^n \E\left[\| X_i \|^k_2 \| X_i \|_p - + \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], - \qquad\gamma(M) - = \P\big(\Omega \npreceq M\big), \\ - \delta_p(M,\eta) - &= - \P\left( - \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p - \geq \eta - \right), - \qquad\pi_3 - = - \sum_{i=1}^{n+m} - \sum_{|\kappa| = 3} - \E \Big[ \big| - \E \left[ X_i^\kappa \mid \cH_{i-1} \right] - \big| \Big], \\ - \varepsilon_p(M, \eta) - &= - \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ - \Omega \preceq M\right), - \end{align*} - % - for $k \in \{2,3\}$, with $Z, Z_1,\dots ,Z_n$ i.i.d.\ standard Gaussian - on $\R^d$ independent of $\cH_n$. -\end{lemma} - -\begin{proof}[Lemma~\ref{lem:yurinskii_app_sa_martingale}] - - \proofparagraph{constructing a modified martingale} - - Take $M \succeq 0$ a fixed positive semi-definite - $d \times d$ matrix. - We start by constructing a new martingale based on $S$ - whose quadratic variation is $\Sigma + M$. - Take $m \geq 1$ and define - % - \begin{align*} - H_k - &= - \Sigma - + M - - \sum_{i=1}^{k} V_i, - \qquad\qquad\qquad\qquad\tau - = - \sup \big\{ k\in\{0,1,\dots,n\} : H_k \succeq 0 \big\}, \\ - \tilde X_i - &= - X_i\I\{i \leq \tau\} - + \frac{1}{\sqrt{m}} H_\tau^{1/2} Z_i\I\{n+1 \leq i \leq n+m\}, - \qquad\qquad\tilde S - = - \sum_{i=1}^{n+m} \tilde X_i, - \end{align*} - % - where $Z_{n+1}, \ldots, Z_{n+m}$ is an i.i.d.\ - sequence of standard Gaussian vectors in $\R^d$ - independent of $\cH_n$, - noting that $H_0 = \Sigma + M \succeq 0$ a.s. - Define the filtration - $\tilde \cH_0, \ldots, \tilde \cH_{n+m}$, - where $\tilde \cH_i = \cH_i$ for $0 \leq i \leq n$ - and is the $\sigma$-algebra generated by - $\cH_n$ and $Z_{n+1}, \dots, Z_{i}$ for $n+1 \leq i\leq n+m$. - Observe that $\tau$ is a stopping time with respect to $\tilde\cH_i$ - because $H_{i+1} - H_i = -V_{i+1} \preceq 0$ almost surely, - so $\{\tau \leq i\} = \{H_{i+1} \nsucceq 0\}$ for $0\leq i \eta \big) - \leq \P\big( \| H_n^{1/2} Z \|_p > \eta,\, \Omega \preceq M) - + \P\big( \Omega \npreceq M \big)$, - so - % - \begin{align*}% - \label{eq:yurinskii_app_approx_modified_original} - \P\big( \| S - \tilde S \|_p > \eta\big) - &\leq - 2 \P\big(\Omega \npreceq M \big) - + \P\big( \| (M-\Omega)^{1/2}Z \|_p > \eta,\, - \Omega \preceq M \big) - = 2 \gamma(M) + \varepsilon_p(M, \eta). - \end{align*} - - \proofparagraph{strong approximation of the modified martingale} - - Let $\tilde Z_1, \ldots, \tilde Z_{n+m}$ be i.i.d.\ $\cN(0, I_d)$ - and independent of $\tilde \cH_{n+m}$. - Define $\check X_i = \tilde V_i^{1/2} \tilde Z_i$ - and $\check S = \sum_{i=1}^{n+m} \check X_i$. 
- Fix a Borel set $A \subseteq \R^d$ and $\sigma, \eta > 0$ and - let $f = f_{A\eta\sigma}$ be the function defined in - Lemma~\ref{lem:yurinskii_app_smooth_approximation}. - By the Lindeberg method, write the telescoping sum - % - \begin{align*} - \E\Big[f\big(\tilde S\big) - f\big(\check S\big) - \mid \cH_0 \Big] - &= - \sum_{i=1}^{n+m} - \E\Big[ f\big(Y_i + \tilde X_i\big) - - f\big(Y_i + \check X_i\big) - \mid \cH_0 \Big] - \end{align*} - % - where - $Y_i = \sum_{j=1}^{i-1} \tilde X_j + \sum_{j=i+1}^{n+m} \check X_j$. - By Lemma~\ref{lem:yurinskii_app_smooth_approximation} we have for $k \geq 0$ - % - \begin{align*} - &\Bigg| - \E\big[ - f(Y_i + \tilde X_i) - - f(Y_i + \check X_i) - \mid \cH_0 - \big] - - \sum_{|\kappa| = 0}^k - \frac{1}{\kappa!} - \E \left[ - \partial^\kappa f(Y_i) - \left( \tilde X_i^\kappa - \check X_i^\kappa \right) - \bigm| \cH_0 - \right] - \Bigg| \\ - &\quad\leq - \frac{1}{\sigma^k \eta \sqrt{k!}} - \E \left[ - \|\tilde X_i\|_p \|\tilde X_i\|_2^k - + \|\check X_i\|_p \|\check X_i\|_2^k - \bigm| \cH_0 - \right]. - \end{align*} - % - With $k \in \{2, 3\}$, we bound each summand. - With $|\kappa| = 0$ we have - $\tilde X_i^\kappa = \check X_i^\kappa$, - so consider $|\kappa| = 1$. - Noting that $\sum_{i=1}^{n+m} \tilde V_i = \Sigma + M$, define - % - \begin{align*} - \tilde Y_i - &= - \sum_{j=1}^{i-1} \tilde X_j - + \tilde Z_i - \Bigg(\sum_{j=i+1}^{n+m} \tilde V_j\Bigg)^{1/2} - = - \sum_{j=1}^{i-1} \tilde X_j - + \tilde Z_i - \Bigg(\Sigma + M - \sum_{j=1}^{i} \tilde V_j\Bigg)^{1/2} - \end{align*} - % - and let $\check \cH_i$ be the $\sigma$-algebra generated by - $\tilde \cH_{i-1}$ and $\tilde Z_i$. - Note that $\tilde Y_i$ is $\check \cH_i$-measurable - and that $Y_i$ and $\tilde Y_i$ - have the same distribution conditional on $\tilde \cH_{n+m}$. So - % - \begin{align*} - &\sum_{|\kappa| = 1} - \frac{1}{\kappa!} - \E\left[ - \partial^\kappa f(Y_i) - \big( \tilde X_i^\kappa - \check X_i^\kappa \big) - \bigm| \cH_0 - \right] - = \E \left[ - \nabla f(Y_i)^\T - \big( \tilde X_i - \tilde V_i^{1/2} \tilde Z_i \big) - \bigm| \cH_0 - \right] \\ - &\quad= - \E \left[ - \nabla f(\tilde Y_i)^\T \tilde X_i - \bigm| \cH_0 - \right] - - \E \left[ - \nabla f(Y_i)^\T \tilde V_i^{1/2} \tilde Z_i - \bigm| \cH_0 - \right] \\ - &\quad= - \E \left[ - \nabla f(\tilde Y_i)^\T - \E \left[ - \tilde X_i - \mid \check \cH_i - \right] - \bigm| \cH_0 - \right] - - \E \left[ - \tilde Z_i - \right] - \E \left[ - \nabla f(Y_i)^\T \tilde V_i^{1/2} - \bigm| \cH_0 - \right] \\ - &\quad= - \E \left[ - \nabla f(\tilde Y_i)^\T - \E \left[ - \tilde X_i - \mid \tilde \cH_{i-1} - \right] - \bigm| \cH_0 - \right] - - 0 - = 0. 
- \end{align*} - % - Next, if $|\kappa| = 2$ then - % - \begin{align*} - &\sum_{|\kappa| = 2} - \frac{1}{\kappa!} - \E \left[ - \partial^\kappa f(Y_i) - \left( \tilde X_i^\kappa - \check X_i^\kappa \right) - \bigm| \cH_0 - \right] \\ - &\quad= - \frac{1}{2} - \E \left[ - \tilde X_i^\T \nabla^2 f(Y_i) \tilde X_i - - \tilde Z_i^\T \tilde V_i^{1/2} \nabla^2 f(Y_i) - \tilde V_i^{1/2} \tilde Z_i - \bigm| \cH_0 - \right] \\ - &\quad= - \frac{1}{2} - \E \left[ - \E \left[ - \Tr \nabla^2 f(\tilde Y_i) \tilde X_i \tilde X_i^\T - \bigm| \check \cH_i - \right] - \bigm| \cH_0 - \right] - - \frac{1}{2} - \E \left[ - \Tr \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} - \bigm| \cH_0 - \right] - \E \left[ - \tilde Z_i \tilde Z_i^\T - \right] \\ - &\quad= - \frac{1}{2} - \E \left[ - \Tr \nabla^2 f(Y_i) - \E \left[ - \tilde X_i \tilde X_i^\T - \bigm| \tilde \cH_{i-1} - \right] - \bigm| \cH_0 - \right] - - \frac{1}{2} - \E \left[ - \Tr \nabla^2 f(Y_i) \tilde V_i - \bigm| \cH_0 - \right] - = 0. - \end{align*} - % - Finally, if $|\kappa| = 3$, then since - $\check X_i \sim \cN(0, \tilde V_i)$ - conditional on $\tilde \cH_{n+m}$, we have by symmetry of the Gaussian - distribution and Lemma~\ref{lem:yurinskii_app_smooth_approximation}, - % - \begin{align*} - & - \left| - \sum_{|\kappa| = 3} - \frac{1}{\kappa!} - \E \left[ - \partial^\kappa f(Y_i) - \left( \tilde X_i^\kappa - \check X_i^\kappa \right) - \bigm| \cH_0 - \right] - \right| - \\ - &\quad= - \left| - \sum_{|\kappa| = 3} - \frac{1}{\kappa!} - \left( - \E \left[ - \partial^\kappa f(\tilde Y_i) - \E \left[ \tilde X_i^\kappa \mid \check \cH_i \right] - \bigm| \cH_0 - \right] - - \E \left[ - \partial^\kappa f(Y_i) \, - \E \left[ - \check X_i^\kappa - \bigm| \tilde \cH_{n+m} - \right] - \bigm| \cH_0 - \right] - \right) - \right| - \\ - &\quad= - \left| - \sum_{|\kappa| = 3} - \frac{1}{\kappa!} - \E \left[ - \partial^\kappa f(Y_i) \, - \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] - \bigm| \cH_0 - \right] - \right| - \leq - \frac{1}{\sigma^3} - \sum_{|\kappa| = 3} - \E \left[ - \left| - \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] - \right| - \bigm| \cH_0 - \right]. - \end{align*} - % - Combining these and summing over $i$ with $k=2$ shows - % - \begin{align*} - \E\left[ - f\big(\tilde S\big) - f\big(\check S\big) - \bigm| \cH_0 - \right] - &\leq - \frac{1}{\sigma^2 \eta \sqrt{2}} - \sum_{i=1}^{n+m} - \E \left[ - \|\tilde X_i\|_p \|\tilde X_i\|_2^2 - + \|\check X_i\|_p \|\check X_i\|_2^2 - \bigm| \cH_0 - \right] - \end{align*} - % - On the other hand, taking $k = 3$ gives - % - \begin{align*} - \E\left[ - f\big(\tilde S\big) - f\big(\check S\big) - \bigm| \cH_0 - \right] - &\leq - \frac{1}{\sigma^3 \eta \sqrt{6}} - \sum_{i=1}^{n+m} - \E \left[ - \|\tilde X_i\|_p \|\tilde X_i\|_2^3 - + \|\check X_i\|_p \|\check X_i\|_2^3 - \bigm| \cH_0 - \right] \\ - &\quad+ - \frac{1}{\sigma^3} - \sum_{i=1}^{n+m} - \sum_{|\kappa| = 3} - \E \left[ - \left| - \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] - \right| - \bigm| \cH_0 - \right]. - \end{align*} - % - For $1 \leq i \leq n$ we have - $\|\tilde X_i\| \leq \|X_i\|$ - and $\|\check X_i\| \leq \|V_i^{1/2} \tilde Z_i\|$. - For $n+1 \leq i \leq n+m$ we have - $\tilde X_i = H_\tau^{1/2} Z_i / \sqrt m$ - and $\check X_i = H_\tau^{1/2} \tilde Z_i / \sqrt m$ - which are equal in distribution given $\cH_0$. 
- So with - % - \begin{align*} - \tilde \beta_{p,k} - &= - \sum_{i=1}^{n} - \E \left[ - \|X_i\|_p \|X_i\|_2^k - + \|V_i^{1/2} Z_i\|_p \|V_i^{1/2} Z_i\|_2^k - \bigm| \cH_0 - \right], - \end{align*} - % - we have, since $k \in \{2,3\}$, - % - \begin{align*} - &\sum_{i=1}^{n+m} - \E \left[ - \|\tilde X_i\|_p \|\tilde X_i\|_2^k - + \|\check X_i\|_p \|\check X_i\|_2^k - \bigm| \cH_0 - \right] - \leq - \tilde\beta_{p,k} - + \frac{2}{\sqrt m} - \E \left[ - \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k - \bigm| \cH_0 - \right]. - \end{align*} - % - Since $H_i$ is weakly decreasing under the - semi-definite partial order, we have - $H_\tau \preceq H_0 = \Sigma + M$ - implying that $|(H_\tau)_{j j}| \leq \|\Sigma + M\|_{\max}$ and - $\E\big[|(H_\tau^{1/2} Z)_j|^3 \mid \cH_0 \big] - \leq \sqrt{8/\pi}\, \|\Sigma + M\|_{\max}^{3/2}$. - Hence as $p \geq 1$ and $k \in \{2,3\}$, - % - \begin{align*} - \E\left[ - \|H_\tau^{1/2}Z\|_p - \|H_\tau^{1/2}Z\|_2^k - \bigm| \cH_0 - \right] - &\leq - \E\left[\|H_\tau^{1/2} Z\|_1^{k+1} - \bigm| \cH_0 - \right] - \leq - d^{k+1} \max_{1\leq j\leq d} - \E\left[|(H_\tau^{1/2} Z)_j|^{k+1} - \bigm| \cH_0 - \right] \\ - &\leq 3 d^4 \, - \|\Sigma + M\|_{\max}^{(k+1)/2} - \leq 6 d^4 \, - \|\Sigma \|_{\max}^{(k+1)/2} - + 6 d^4 \|M\|. - \end{align*} - % - Assuming some $X_i$ is not identically zero so - the result is non-trivial, - and supposing that $\Sigma$ is bounded a.s.\ - (replacing $\Sigma$ by $\Sigma \cdot \I\{\|\Sigma\|_{\max} \leq C\}$ - for an appropriately large $C$ if necessary), - take $m$ large enough that - % - \begin{align} - \label{eq:yurinskii_app_bound_extra_terms} - \frac{2}{\sqrt m} - \E \left[ - \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k - \bigm| \cH_0 - \right] - \leq - \frac{1}{4} - \beta_{p,k}. - \end{align} - % - Further, if $|\kappa| = 3$ then - $\big|\E \big[ - \tilde X_i^\kappa \mid \tilde \cH_{i-1} \big]\big| - \leq \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right]\big|$ - for $1 \leq i \leq n$ - while by symmetry of the Gaussian distribution - $\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] = 0$ - for $n+1 \leq i \leq n+m$. - Hence with - % - \begin{align*} - \tilde \pi_3 - &= - \sum_{i=1}^{n+m} - \sum_{|\kappa| = 3} - \E \Big[ \big| - \E \left[ X_i^\kappa \mid \cH_{i-1} \right] - \big| \mid \cH_0 \Big], - \end{align*} - % - we have - % - \begin{align*} - \E\left[ - f\big(\tilde S\big) - f\big(\check S\big) - \bigm| \cH_0 - \right] - &\leq - \min \left\{ - \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} - + \frac{\beta_{p,2}}{4 \sigma^2 \eta}, - \frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} - + \frac{\beta_{p,3}}{4 \sigma^3 \eta} - + \frac{\tilde \pi_3}{\sigma^3} - \right\}. - \end{align*} - % - Along with Lemma~\ref{lem:yurinskii_app_smooth_approximation}, and with - $\sigma = \eta / t$ and $\varepsilon = \P(\|Z\|_p > t)$, - we conclude that - % - \begin{align*} - &\P(\tilde S \in A \mid \cH_0) - = - \E\big[\I\{\tilde S \in A\} - f(\tilde S) - \mid \cH_0 - \big] - + \E\big[f(\tilde S) - f\big(\check S\big) - \mid \cH_0 - \big] - + \E \big[f\big(\check S\big) - \mid \cH_0 - \big] \\ - &\,\leq - \varepsilon\P(\tilde S \in A - \mid \cH_0) - + \min \! 
\left\{ - \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} - + \frac{\beta_{p,2}}{4 \sigma^2 \eta}, - \frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} - + \frac{\beta_{p,3}}{4 \sigma^3 \eta} - + \frac{\tilde \pi_3}{\sigma^3} - \right\} - + - \varepsilon - + (1 - \varepsilon) \P\big(\check S \in A_p^{3\eta} - \mid \cH_0 - \big) \\ - &\,\leq - \P\big( \check S \in A_p^{3\eta} - \mid \cH_0 - \big) - + 2 \P(\|Z\|_p > t) - + \min\!\left\{ - \frac{3 \tilde \beta_{p,2} t^2}{4 \eta^3} - + \frac{\beta_{p,2} t^2}{4 \eta^3}, - \frac{3 \tilde \beta_{p,3} t^3}{4 \eta^4} - + \frac{\beta_{p,3} t^3}{4 \eta^4} - + \frac{\tilde \pi_3 t^3}{\eta^3} - \right\}. - \end{align*} - % - Taking a supremum and an outer expectation yields - with $\beta_{p,k} = \E\big[\tilde \beta_{p,k}\big]$ - and $\pi_3 = \E[\tilde \pi_3]$, - % - \begin{align*} - &\E^* \left[ - \sup_{A \in \cB(\R^d)} - \left\{ - \P(\tilde S \in A \mid \cH_0) - - \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big) - \right\} - \right] \\ - &\quad\leq - 2 \P(\|Z\|_p > t) - + \min \left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\}. - \end{align*} - % - Finally, since - $\check S = \sum_{i=1}^n \tilde V_i^{1/2} \tilde Z_i - \sim \cN(0,\Sigma + M)$ conditional on $\cH_0$, - the conditional Strassen theorem - in Lemma~\ref{lem:yurinskii_app_strassen} - ensures the existence of $\tilde S$ and - $\tilde T \mid \cH_0 \sim \cN(0, \Sigma + M)$ - such that - % - \begin{align} - \label{eq:yurinskii_app_approx_modified_martingale} - \P\left(\|\tilde S-\tilde T\|_p>3\eta\right) - &\leq - \inf_{t>0} - \left\{ - 2 \P(\|Z\|_p > t) - + \min \left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\}, - \end{align} - % - since the infimum is attained by continuity of $\|Z\|_p$. - - \proofparagraph{conclusion} - - We show how to write - $\tilde T = (\Sigma + M)^{1/2} W$ - where $W \sim \cN(0,I_d)$ - and use this representation to construct - $T \mid \cH_0 \sim \cN(0, \Sigma)$. - By the spectral theorem, let $\Sigma + M = U \Lambda U^\T$ - where $U$ is a $d \times d$ orthogonal random matrix - and $\Lambda$ is a diagonal $d \times d$ random matrix with - diagonal entries satisfying - $\lambda_1 \geq \cdots \geq \lambda_r > 0$ - and $\lambda_{r+1} = \cdots = \lambda_d = 0$ - where $r = \rank (\Sigma + M)$. - Let $\Lambda^+$ be the Moore--Penrose pseudo-inverse of $\Lambda$ - (obtained by inverting its non-zero elements) and define - $W = U (\Lambda^+)^{1/2} U^\T \tilde T + U \tilde W$, where - the first $r$ elements of $\tilde W$ are zero - and the last $d-r$ elements are i.i.d.\ $\cN(0,1)$ - independent from $\tilde T$. - Then, it is easy to check that - $W \sim \cN(0, I_d)$ and that - $\tilde T = (\Sigma + M)^{1/2} W$. - Now define $T = \Sigma^{1/2} W$ so - % - \begin{equation}% - \label{eq:yurinskii_app_approx_target} - \P\big(\|T - \tilde T\|_p > \eta\big) - = \P\big(\big\|\big((\Sigma + M)^{1/2} - - \Sigma^{1/2} \big) W \big\|_p>\eta \big) - = \delta_p(M, \eta). 
- \end{equation} - % - Finally - \eqref{eq:yurinskii_app_approx_modified_original}, - \eqref{eq:yurinskii_app_approx_modified_martingale}, - \eqref{eq:yurinskii_app_approx_target}, - the triangle inequality, - and a union bound conclude the proof since - by taking an infimum over $M \succeq 0$, - and by possibly reducing the constant of $1/4$ in - \eqref{eq:yurinskii_app_bound_extra_terms} to account for - this infimum being potentially unattainable, - % - \begin{align*} - \P\big(\|S-T\|_p > 5\eta\big) - &\leq - \P\big(\|\tilde S - \tilde T \|_p > 3\eta \big) - +\P\big(\|S - \tilde S \|_p > \eta\big) - +\P\big(\|T - \tilde T \|_p > \eta\big) \\ - &\leq - \inf_{t>0} - \left\{ - 2 \P\big( \|Z\|_p > t \big) - + \min\left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\} \\ - &\quad+ - \inf_{M \succeq 0} - \big\{ 2\gamma(M) + \delta_p(M,\eta) - + \varepsilon_p(M, \eta)\big\}. - \end{align*} - % -\end{proof} - -Lemma~\ref{lem:yurinskii_app_sa_martingale} and the martingale approximation -immediately yield Theorem~\ref{thm:yurinskii_sa_dependent}. - -\begin{proof}[Theorem~\ref{thm:yurinskii_sa_dependent}] - Apply Lemma~\ref{lem:yurinskii_app_sa_martingale} to - the martingale $\sum_{i=1}^{n} \tilde X_i$, - noting that $S - \sum_{i=1}^{n} \tilde X_i = U$. -\end{proof} - -Bounding the quantities -in Theorem~\ref{thm:yurinskii_sa_dependent} gives a -user-friendly version as Proposition~\ref{pro:yurinskii_sa_simplified}. - -\begin{proof}[Proposition~\ref{pro:yurinskii_sa_simplified}] - - Set $M = \nu^2 I_d$ and - bound the terms appearing - the main inequality in Proposition~\ref{pro:yurinskii_sa_simplified}. - - \proofparagraph{bounding $\P( \|Z\|_p > t )$} - - By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, - we have - $\P( \|Z\|_p > t ) \leq \E[\|Z\|_p] / t \leq \phi_p(d) / t$. - - \proofparagraph{bounding $\gamma(M)$} - - With $M = \nu^2 I_d$, - by Markov's inequality, - $\gamma(M) = \P\big(\Omega \npreceq M\big) - = \P\big(\|\Omega\|_2 > \nu^2 \big) - \leq \nu^{-2} \E[\|\Omega\|_2]$. - - \proofparagraph{bounding $\delta(M, \eta)$} - - By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, - using - $\max_j |M_{j j}| \leq \|M\|_2$ - for $M \succeq 0$, - % - \begin{align*} - \delta_{p}(M,\eta) - &= \P\left( - \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p - \geq \eta - \right) - \leq \frac{\phi_p(d)} {\eta} - \E \left[ - \big\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\big\|_2 - \right]. - \end{align*} - % - For semi-definite matrices - the eigenvalue operator commutes with smooth matrix functions so - % - \begin{align*} - \|(\Sigma +M)^{1/2}- \Sigma^{1/2}\|_2 - &= - \max_{1 \leq j \leq d} - \left| - \sqrt{\lambda_j(\Sigma) + \nu^2} - \sqrt{\lambda_j(\Sigma)} - \right| - \leq \nu - \end{align*} - % - and hence $\delta_{p}(M,\eta) \leq \phi_p(d)\nu / \eta$. - - \proofparagraph{bounding $\varepsilon(M, \eta)$} - - Note that $(M -\Omega)^{1/2}Z$ is a centered Gaussian - conditional on $\cH_n$, - on the event $\{\Omega \preceq M\}$. 
- We thus have by Markov's inequality, - Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, - and Jensen's inequality that - % - \begin{align*} - \varepsilon_p(M, \eta) - &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ - \Omega \preceq M\right) - \leq - \frac{1}{\eta} - \E\left[ - \I\{\Omega \preceq M\} - \E\left[ - \big\| (M - \Omega)^{1/2} Z \big\|_p - \mid \cH_n - \right] - \right] \\ - &\leq - \frac{\phi_p(d)}{\eta} - \E\left[ - \I\{\Omega \preceq M\} - \max_{1 \leq j \leq d} - \sqrt{(M - \Omega)_{j j}} - \right] - \leq - \frac{\phi_p(d)}{\eta} - \E\left[ - \sqrt{\|M - \Omega\|_2} - \right] \\ - &\leq - \frac{\phi_p(d)}{\eta} - \E\left[ - \sqrt{\|\Omega\|_2} + \nu - \right] - \leq - \frac{\phi_p(d)}{\eta} - \left(\sqrt{\E[\|\Omega\|_2]} + \nu \right). - \end{align*} - % - Thus by Theorem~\ref{thm:yurinskii_sa_dependent} and the previous parts, - % - \begin{align*} - \P\big(\|S-T\|_p > 6\eta\big) - &\leq - \inf_{t>0} - \left\{ - 2 \P\big(\|Z\|_p>t\big) - + \min\left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\} \\ - &\quad+ - \inf_{M \succeq 0} - \big\{ 2\gamma(M) + \delta_p(M,\eta) - + \varepsilon_p(M, \eta)\big\} - +\P\big(\|U\|_p>\eta\big) \\ - &\leq - \inf_{t>0} - \left\{ - \frac{2 \phi_p(d)}{t} - + \min\left\{ - \frac{\beta_{p,2} t^2}{\eta^3}, - \frac{\beta_{p,3} t^3}{\eta^4} - + \frac{\pi_3 t^3}{\eta^3} - \right\} - \right\} \\ - &\quad+ - \inf_{\nu > 0} - \left\{ \frac{2\E \left[ \|\Omega\|_2 \right]}{\nu^2} - + \frac{2 \phi_p(d) \nu}{\eta} - \right\} - + \frac{\phi_p(d) \sqrt{\E \left[ \|\Omega\|_2 \right]}}{\eta} - +\P\big(\|U\|_p>\eta\big). - \end{align*} - % - Set $t = 2^{1/3} \phi_p(d)^{1/3} \beta_{p,2}^{-1/3} \eta$ - and $\nu = \E[\|\Omega\|_2]^{1/3} \phi_p(d)^{-1/3} \eta^{1/3}$, - then replace $\eta$ with $\eta / 6$ to see - % - \begin{align*} - \P\big(\|S-T\|_p > 6\eta\big) - &\leq - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - +\P\left(\|U\|_p>\frac{\eta}{6}\right). - \end{align*} - % - Whenever $\pi_3 = 0$ we can set - $t = 2^{1/4} \phi_p(d)^{1/4} \beta_{p,3}^{-1/4} \eta$, - and with $\nu$ as above we obtain - % - \begin{align*} - \P\big(\|S-T\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} - \right)^{1/4} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3} - +\P\left(\|U\|_p>\frac{\eta}{6}\right). - \end{align*} - % -\end{proof} - -After establishing Proposition~\ref{pro:yurinskii_sa_simplified}, -Corollaries~\ref{cor:yurinskii_sa_mixingale}, -\ref{cor:yurinskii_sa_martingale}, -and \ref{cor:yurinskii_sa_indep} follow easily. - -\begin{proof}[Corollary~\ref{cor:yurinskii_sa_mixingale}] - Proposition~\ref{pro:yurinskii_sa_simplified} with - $\P ( \|U\|_p > \frac{\eta}{6} ) - \leq \frac{6}{\eta} \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. -\end{proof} - -\begin{proof}[Corollary~\ref{cor:yurinskii_sa_martingale}] - By Proposition~\ref{pro:yurinskii_sa_simplified} - with $U=0$ a.s. -\end{proof} - -\begin{proof}[Corollary~\ref{cor:yurinskii_sa_indep}] - By Corollary~\ref{cor:yurinskii_sa_martingale} - with $\Omega=0$ a.s. -\end{proof} - -We conclude this section with a discussion expanding on the comments made -in Remark~\ref{rem:yurinskii_coupling_bounds_probability} on deriving bounds in -probability from Yurinskii's coupling. 
Consider for illustration the -independent data second-order result given in -Corollary~\ref{cor:yurinskii_sa_indep}: for each $\eta > 0$, -there exists $T_n \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying -% -\begin{align*} - \P\big(\|S_n-T_n\|_p > \eta\big) - &\leq - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3}, -\end{align*} -% -where here we make explicit the dependence on the sample size $n$ for clarity. -The naive approach to converting this into a probability bound for -$\|S_n-T_n\|_p$ is to select $\eta$ to ensure the right-hand side is -of order $1$, arguing that the probability can then be made arbitrarily -small by taking, in this case, $\eta$ to be a large enough multiple of -$\beta_{p,2}^{1/3} \phi_p(d)^{2/3}$. However, the somewhat subtle mistake is -in neglecting the fact that the realization of the coupling variable $T_n$ -will in general depend on $\eta$, rendering the resulting -bound invalid. -As an explicit example of this phenomenon, take $\eta > 1$ and suppose -$\|S_n - T_n(\eta)\| = \eta$ with probability $1 - 1/\eta$ and -$\|S_n - T_n(\eta)\| = n$ with probability $1/\eta$. -Then $\P\big(\|S_n - T_n(\eta)\| > \eta\big) = 1/\eta$ -but it is not true for any $\eta$ that $\|S_n - T_n(\eta)\| \lesssim_\P 1$. - -We propose in Remark~\ref{rem:yurinskii_coupling_bounds_probability} the -following fix. -Instead of selecting $\eta$ to ensure the right-hand side is of order $1$, -we instead choose it so the bound converges (slowly) to zero. This is -easily achieved by taking the naive and incorrect bound and multiplying -by some divergent sequence $R_n$. The resulting inequality reads, -in the case of Corollary~\ref{cor:yurinskii_sa_indep} with -$\eta = \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n$, -% -\begin{align*} - \P\Big(\|S_n-T_n\|_p > - \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n - \Big) - &\leq - \frac{24}{R_n} - \to 0. -\end{align*} -% -We thus recover, for the price of a rate which is slower by an arbitrarily -small amount, a valid upper bound in probability, as we can immediately -conclude that -% -\begin{align*} - \|S_n-T_n\|_p - \lesssim_\P - \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n. -\end{align*} - -\subsection{Strong approximation for martingale empirical processes} - -We begin by presenting some calculations omitted from the main text -relating to the motivating example of kernel density estimation with -i.i.d.\ data. -First, the bias is bounded as -% -\begin{align*} - \big| \E \big[ \hat g(x) \big] - g(x) \big| - &= - \left| - \int_{\frac{-x}{h}}^{\frac{1-x}{h}} - K(\xi) - \diff \xi - - 1 - \right| - \leq - 2 \int_{\frac{a}{h}}^\infty - \frac{1}{\sqrt{2 \pi}} - e^{-\frac{\xi^2}{2}} - \diff \xi - \leq - \frac{h}{a} - \sqrt{\frac{2}{\pi}} - e^{-\frac{a^2}{2 h^2}}. -\end{align*} -% -Next, we do the calculations necessary to apply -Corollary~\ref{cor:yurinskii_sa_indep}. -Define $k_{i j} = \frac{1}{n h} K \left( \frac{X_i - x_j}{h} \right)$ and -$k_i = (k_{i j} : 1 \leq j \leq N)$. -Then $\|k_i\|_\infty \leq \frac{1}{n h \sqrt{2 \pi}}$ a.s.\ and -$\E[\|k_i\|_2^2] \leq \frac{N}{n^2 h} \int_{-\infty}^\infty K(\xi)^2 \diff \xi -\leq \frac{N}{2 n^2 h \sqrt{\pi}}$. 
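The kernel constant appearing here is explicit: for the Gaussian kernel
$K(\xi) = \frac{1}{\sqrt{2 \pi}} e^{-\xi^2/2}$ used in this example,
%
\begin{align*}
  \int_{-\infty}^\infty K(\xi)^2 \diff \xi
  = \frac{1}{2 \pi}
  \int_{-\infty}^\infty e^{-\xi^2} \diff \xi
  = \frac{1}{2 \sqrt{\pi}},
\end{align*}
%
giving the stated bound on $\E[\|k_i\|_2^2]$.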
-Let $V = \Var[k_i] \in \R^{N \times N}$, -so assuming that $1/h \geq \log 2 N$, -by Lemma~\ref{lem:yurinskii_app_gaussian_useful} we bound -% -\begin{align*} - \beta_{\infty,2} - &= - n \E\left[\| k_i \|^2_2 \| k_i \|_\infty - \right] - + n \E \left[ \|V^{1/2} Z \|^2_2 \|V^{1/2} Z \|_\infty \right] - \leq - \frac{N}{\sqrt{8} n^2 h^2 \pi} - + \frac{4 N \sqrt{\log 2 N}}{\sqrt{8} n^2 h^{3/2} \pi^{3/4}} - \leq - \frac{N}{n^2 h^2}. -\end{align*} -% -Finally, we verify the stochastic continuity bounds. -By the Lipschitz property of $K$, it is easy to show that -for $x,x' \in \cX$ we have -$\left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) -- \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right| -\lesssim \frac{|x-x'|}{h^2}$ almost surely, and also that -$\E \Big[ \left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) -- \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right|^2 \Big] -\lesssim \frac{|x-x'|^2}{h^3}$. -By chaining with the Bernstein--Orlicz norm and polynomial covering numbers, -% -\begin{align*} - \sup_{|x-x'| \leq \delta} - \big\|S(x) - S(x')\big\|_\infty - \lesssim_\P - \delta - \sqrt{\frac{\log n}{n h^3}} -\end{align*} -% -whenever $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$. -By a Gaussian process maximal inequality -\citep[Corollary~2.2.8]{van1996weak} -the same bound holds for $T(x)$ with -% -\begin{align*} - \sup_{|x-x'| \leq \delta} - \big\|T(x) - T(x')\big\|_\infty - \lesssim_\P - \delta - \sqrt{\frac{\log n}{n h^3}}. -\end{align*} - -\begin{proof}[Lemma~\ref{lem:yurinskii_kde_eigenvalue}] - - For $x, x' \in [a, 1-a]$, the scaled covariance function - of this nonparametric estimator is - % - \begin{align*} - n h\, \Cov\big[\hat g(x), \hat g(x')\big] - &= - \frac{1}{h} - \E \left[ - K \left( \frac{X_i - x}{h} \right) - K \left( \frac{X_i - x'}{h} \right) - \right] \\ - &\quad- - \frac{1}{h} - \E \left[ - K \left( \frac{X_i - x}{h} \right) - \right] - \E \left[ - K \left( \frac{X_i - x'}{h} \right) - \right] \\ - &= - \frac{1}{2 \pi} - \int_{\frac{-x}{h}}^{\frac{1-x}{h}} - \exp \left( - \frac{t^2}{2} \right) - \exp \left( - \frac{1}{2} \left( t + \frac{x - x'}{h} \right)^2 \right) - \diff t - - h I(x) I(x') - \end{align*} - % - where - $I(x) = \frac{1}{\sqrt 2 \pi} \int_{-x/h}^{(1-x)/h} e^{-t^2/2} \diff t$. - Completing the square and a substitution gives - % - \begin{align*} - n h\, \Cov\big[\hat g(x), \hat g(x')\big] - &= - \frac{1}{2 \pi} - \exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) - \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} - \exp \left(-t^2\right) - \diff t - - h I(x) I(x'). - \end{align*} - % - Now we show that since $x, x'$ are not too close to the boundary - of $[0,1]$, - the limits in the above integral can be replaced by $\pm \infty$. - Note that $\frac{-x-x'}{2h} \leq \frac{-a}{h}$ - and $\frac{2-x-x'}{2h} \geq \frac{a}{h}$ so - % - \begin{align*} - \int_{-\infty}^{\infty} - \exp \left(-t^2\right) - \diff t - - \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} - \exp \left(-t^2\right) - \diff t - \leq - 2 \int_{a/h}^\infty - \exp \left(-t^2\right) - \diff t - \leq - \frac{h}{a} - \exp \left(- \frac{a^2}{h^2}\right). - \end{align*} - % - Therefore, since - $\int_{-\infty}^{\infty} e^{-t^2} \diff t = \sqrt \pi$, - % - \begin{align*} - \left| - n h\, \Cov\big[\hat g(x), \hat g(x')\big] - - \frac{1}{2 \sqrt \pi} - \exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) - + h I(x) I(x') - \right| - \leq - \frac{h}{2 \pi a} - \exp \left(- \frac{a^2}{h^2}\right). 
- \end{align*} - % - Define the $N \times N$ matrix - $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} - \exp \left( - \frac{1}{4} \left( \frac{x_i-x_j}{h} \right)^2 \right)$. - By \citet[Proposition~2.4, - Proposition~2.5, and Equation~2.10]{baxter1994norm}, - with - $\cB_k = \big\{b \in \R^\Z : - \sum_{i \in \Z} \I\{b_i \neq 0\} \leq k \big\}$, - % - \begin{align*} - \inf_{k \in \N} - \inf_{b \in \R^k} - \frac{\sum_{i=1}^k \sum_{j=1}^k b_i b_j \, e^{-\lambda(i-j)^2}} - {\sum_{i=1}^k b_i^2} - = - \sqrt{\frac{\pi}{\lambda}} - \sum_{i=-\infty}^{\infty} - \exp \left( - \frac{(\pi e + 2 \pi i)^2}{4 \lambda} \right). - \end{align*} - % - We use Riemann sums, - noting that $\pi e + 2 \pi x = 0$ at - $x = -e/2 \approx -1.359$. - Consider the substitutions - $\Z \cap (-\infty, -3] \mapsto (-\infty, -2]$, - $\{-2, -1\} \mapsto \{-2, -1\}$, and - $\Z \cap [0, \infty) \mapsto [-1, \infty)$. - % - \begin{align*} - \sum_{i \in \Z} - e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} - &\leq - \int_{-\infty}^{-2} - e^{ - (\pi e + 2 \pi x)^2/4 \lambda} - \diff x - + e^{- (\pi e - 4 \pi)^2/4 \lambda} \\ - &\quad+ - e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} - + \int_{-1}^{\infty} - e^{ -(\pi e + 2 \pi x)^2 / 4 \lambda} - \diff x. - \end{align*} - % - Now use the substitution $t = \frac{\pi e + 2 \pi x}{2 \sqrt \lambda}$ - and suppose $\lambda < 1$, yielding - % - \begin{align*} - \sum_{i \in \Z} - e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} - &\leq - \frac{\sqrt \lambda}{\pi} - \int_{-\infty}^{\frac{\pi e - 4 \pi}{2 \sqrt \lambda}} - e^{-t^2} - \diff t - + e^{- (\pi e - 4 \pi)^2/4 \lambda} - + e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} - + \frac{\sqrt \lambda}{\pi} - \int_{\frac{\pi e - 2 \pi}{2 \sqrt \lambda}}^{\infty} - e^{-t^2} - \diff t \\ - &\leq - \left( 1 + \frac{1}{\pi} \frac{\lambda}{4 \pi - \pi e} \right) - e^{-(\pi e - 4 \pi)^2 / 4 \lambda} - + - \left( 1 + \frac{1}{\pi} \frac{\lambda}{\pi e - 2 \pi} \right) - e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \\ - &\leq - \frac{13}{12} - e^{-(\pi e - 4 \pi)^2 / 4 \lambda} - + - \frac{8}{7} - e^{- (\pi e - 2 \pi)^2 / 4 \lambda} - \leq - \frac{9}{4} - \exp \left( - \frac{5}{4 \lambda} \right). - \end{align*} - % - Therefore - % - \begin{align*} - \inf_{k \in \N} - \inf_{b \in \cB_k} - \frac{\sum_{i \in \Z} \sum_{j \in \Z} b_i b_j \, e^{-\lambda(i-j)^2}} - {\sum_{i \in \Z} b_i^2} - < \frac{4}{\sqrt \lambda} - \exp \left( - \frac{5}{4 \lambda} \right) - < 4 e^{-1/\lambda}. - \end{align*} - % - From this and since - $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} e^{-\lambda(i-j)^2}$ - with $\lambda = \frac{1}{4(N-1)^2 h^2} \leq \frac{\delta^2}{h^2}$, - for each $h$ and some $\delta \leq h$, - we have $\lambda_{\min}(\tilde\Sigma) \leq 2 e^{-h^2/\delta^2}$. - Recall that - % - \begin{align*} - \left| - \Sigma_{i j} - - \tilde\Sigma_{i j} - + h I(x_i) I(x_j) - \right| - \leq - \frac{h}{2 \pi a} - \exp \left(- \frac{a^2}{h^2}\right). - \end{align*} - % - For any positive semi-definite $N \times N$ matrices $A$ and $B$ - and vector $v$ we have $\lambda_{\min}(A - v v^\T) \leq \lambda_{\min}(A)$ - and $\lambda_{\min}(B) \leq \lambda_{\min}(A) + \|B-A\|_2 - \leq \lambda_{\min}(A) + N \|B-A\|_{\max}$. - Hence with $I_i = I(x_i)$, - % - \begin{align*} - \lambda_{\min}(\Sigma) - &\leq - \lambda_{\min}(\tilde\Sigma - h I I^\T) - + \frac{N h}{2 \pi a} - \exp \left(- \frac{a^2}{h^2}\right) - \leq - 2 e^{-h^2/\delta^2} - + \frac{h}{\pi a \delta} - e^{-a^2 / h^2}. - \end{align*} -\end{proof} - -\begin{proof}[Proposition~\ref{pro:yurinskii_emp_proc}] - - Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$. 
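  In particular, for every $f \in \cF$ there is some $f' \in \cF_\delta$
  with $d(f, f') \leq \delta$, and by the triangle inequality
  %
  \begin{align*}
    \big| S(f) - T(f) \big|
    \leq \big| S(f) - S(f') \big|
    + \big| S(f') - T(f') \big|
    + \big| T(f') - T(f) \big|.
  \end{align*}
  %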
- Using a union bound, we can write - % - \begin{align*} - &\P\left(\sup_{f \in \cF} - \big| S(f) - T(f) \big| - \geq 2t + \eta \right) - \leq - \P\left(\sup_{f \in \cF_\delta} - \big| S(f) - T(f) \big| - \geq \eta \right) \\ - &\qquad\qquad+ - \P\left(\sup_{d(f,f') \leq \delta} - \big| S(f) - S(f') \big| - \geq t \right) - + \P\left(\sup_{d(f,f') \leq \delta} - \big| T(f) - T(f') \big| - \geq t \right). - \end{align*} - - \proofparagraph{bounding the difference on $\cF_\delta$} - - We apply Corollary~\ref{cor:yurinskii_sa_martingale} - with $p = \infty$ to the - martingale difference sequence - $\cF_\delta(X_i) = \big(f(X_i) : f \in \cF_\delta\big)$ - which takes values in $\R^{|\cF_\delta|}$. - Square integrability can be assumed otherwise - $\beta_\delta = \infty$. - Note $\sum_{i=1}^n \cF_\delta(X_i) = S(\cF_\delta)$ - and $\phi_\infty(\cF_\delta) \leq \sqrt{2 \log 2 |\cF_\delta|}$. - Therefore there exists a conditionally Gaussian vector $T(\cF_\delta)$ - with the same covariance structure as $S(\cF_\delta)$ - conditional on $\cH_0$ satisfying - % - \begin{align*} - \P\left( - \sup_{f \in \cF_\delta} - \big| S(f) - T(f) \big| - \geq \eta - \right) - &\leq - \frac{24\beta_\delta^{\frac{1}{3}} - (2\log 2 |\cF_\delta|)^{\frac{1}{3}}}{\eta} - + 17\left(\frac{\sqrt{2 \log 2 |\cF_\delta|} - \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{\frac{2}{3}}. - \end{align*} - - \proofparagraph{bounding the fluctuations in $S(f)$} - - Since $\big\| S(f) - S(f') \big\|_\psi \leq L d(f,f')$, - by Theorem~2.2.4 in \citet{van1996weak} - % - \begin{align*} - \left\| - \sup_{d(f,f') \leq \delta} - \big| S(f) - S(f') \big| - \right\|_\psi - &\leq - C_\psi L - \left( - \int_0^\delta - \psi^{-1}(N_\varepsilon) \diff{\varepsilon} - + \delta \psi^{-1}(N_\delta^2) - \right) - = C_\psi L J_\psi(\delta). - \end{align*} - % - Then, by Markov's inequality and the definition of the Orlicz norm, - % - \begin{align*} - \P\left( - \sup_{d(f,f') \leq \delta} - \big| S(f) - S(f') \big| - \geq t - \right) - &\leq - \psi\left(\frac{t}{C_\psi L J_\psi(\delta)} \right)^{-1}. - \end{align*} - - \proofparagraph{bounding the fluctuations in $T(f)$} - - By the Vorob'ev--Berkes--Philipp theorem - \citep{dudley1999uniform}, - $T(\cF_\delta)$ extends to a conditionally Gaussian process $T(f)$. - Firstly, since - $\bigvvvert T(f) - T(f') \bigvvvert_2 \leq L d(f,f')$ - conditionally on $\cH_0$, - and $T(f)$ is a conditional Gaussian process, we have - $\big\| T(f) - T(f') \big\|_{\psi_2} \leq 2 L d(f,f')$ - conditional on $\cH_0$ - by \citet[Chapter~2.2, Complement~1]{van1996weak}, - where $\psi_2(x) = \exp(x^2) - 1$. - Thus again by Theorem~2.2.4 in \citet{van1996weak}, - again conditioning on $\cH_0$, - % - \begin{align*} - \left\| - \sup_{d(f,f') \leq \delta} - \big| T(f) - T(f') \big| - \right\|_{\psi_2} - &\leq - C_1 L - \int_0^\delta - \sqrt{\log N_\varepsilon} \diff{\varepsilon} - = C_1 L J_2(\delta) - \end{align*} - % - for some universal constant $C_1 > 0$, - where we used $\psi_2^{-1}(x) = \sqrt{\log(1+x)}$ - and monotonicity of covering numbers. - Then by Markov's inequality and the definition of the Orlicz norm, - % - \begin{align*} - \P\left( - \sup_{d(f,f') \leq \delta} - \big| T(f) - T(f') \big| - \geq t - \right) - &\leq - \left( - \exp\left( - \frac{t^2}{C_1^2 L^2 J_2(\delta)^2} - \right) - 1 - \right)^{-1} - \!\vee 1 - \leq - 2 \exp\left( - \frac{-t^2}{C_1^2 L^2 J_2(\delta)^2} - \right). 
- \end{align*} - % - - \proofparagraph{conclusion} - - The result follows by scaling $t$ and $\eta$ - and enlarging constants if necessary. - % -\end{proof} - -\subsection{Applications to nonparametric regression} - -\begin{proof}[Proposition~\ref{pro:yurinskii_series}] - - Proceed according to the decomposition in - Section~\ref{sec:yurinskii_series}. - By stationarity and Lemma~SA-2.1 in - \citet{cattaneo2020large}, - we have $\sup_w \|p(w)\|_1 \lesssim 1$ - and also $\|H\|_1 \lesssim n/k$ - and $\|H^{-1}\|_1 \lesssim k/n$. - - \proofparagraph{bounding $\beta_{\infty,2}$ and $\beta_{\infty,3}$} - - Set $X_i = p(W_i) \varepsilon_i$ - so $S = \sum_{i=1}^n X_i$, - and set $\sigma^2_i = \sigma^2(W_i)$ and - $V_i = \Var[X_i \mid \cH_{i-1}] = \sigma_i^2 p(W_i) p(W_i)^\T$. - Recall from Corollary~\ref{cor:yurinskii_sa_martingale} that for - $r \in \{2,3\}$, - % - \begin{align*} - \beta_{\infty,r} - = \sum_{i=1}^n \E\left[\| X_i \|^r_2 \| X_i \|_\infty - + \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] - \end{align*} - % - with $Z_i \sim \cN(0,1)$ i.i.d.\ and independent of $V_i$. - For the first term, we use - $\sup_w \|p(w)\|_2 \lesssim 1$ - and bounded third moments of $\varepsilon_i$: - % - \begin{align*} - \E\left[ \| X_i \|^r_2 \| X_i \|_\infty \right] - &\leq - \E\left[ |\varepsilon_i|^3 \| p(W_i) \|^{r+1}_2 \right] - \lesssim 1. - \end{align*} - % - For the second term, apply Lemma~\ref{lem:yurinskii_app_gaussian_useful} - conditionally on - $\cH_n$ with $\sup_w \|p(w)\|_2 \lesssim 1$ to see - % - \begin{align*} - &\E\left[ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] - \lesssim - \sqrt{\log 2k} \ - \E\left[ - \max_{1 \leq j \leq k} - (V_i)_{j j}^{1/2} - \bigg( \sum_{j=1}^k (V_i)_{j j} \bigg)^{r/2} - \right] \\ - &\quad\lesssim - \sqrt{\log 2k} \ - \E\left[ - \sigma_i^{r+1} - \max_{1 \leq j \leq k} - p(W_i)_j - \bigg( - \sum_{j=1}^k - p(W_i)_{j}^2 - \bigg)^{r/2} - \right] - \lesssim - \sqrt{\log 2k} \ - \E\left[ - \sigma_i^{r+1} - \right] - \lesssim - \sqrt{\log 2k}. - \end{align*} - % - Putting these together yields - % - $\beta_{\infty,2} \lesssim n \sqrt{\log 2k}$ - and $\beta_{\infty,3} \lesssim n \sqrt{\log 2k}$. - - \proofparagraph{bounding $\Omega$} - - Set $\Omega = \sum_{i=1}^n \big(V_i - \E[V_i] \big)$ so - % - \begin{align*} - \Omega - &= \sum_{i=1}^n - \big(\sigma_i^2 p(W_i)p(W_i)^\T - \E\left[ \sigma_i^2 p(W_i)p(W_i)^\T - \right]\big). - \end{align*} - % - Observe that $\Omega_{j l}$ is the sum of a zero-mean - strictly stationary $\alpha$-mixing sequence and so $\E[\Omega_{j l}^2] - \lesssim n$ by - Lemma~\ref{lem:yurinskii_app_variance_mixing}% - \ref{it:yurinskii_app_variance_mixing_bounded}. - Since the basis functions - satisfy Assumption~3 in \citet{cattaneo2020large}, $\Omega$ has a bounded - number of non-zero entries in each row, so by Jensen's inequality - % - \begin{align*} - \E\left[ - \|\Omega\|_2 - \right] - &\leq - \E\left[ - \|\Omega\|_\rF - \right] - \leq - \left( - \sum_{j=1}^k - \sum_{l=1}^k - \E\left[ - \Omega_{j l}^2 - \right] - \right)^{1/2} - \lesssim \sqrt{n k}. - \end{align*} - % - - \proofparagraph{strong approximation} - - By Corollary~\ref{cor:yurinskii_sa_martingale} and the previous parts, - with any sequence $R_n \to \infty$, - % - \begin{align*} - \|S - T \|_\infty - &\lesssim_\P - \beta_{\infty,2}^{1/3} (\log 2k)^{1/3} R_n - + \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n \\ - &\lesssim_\P - n^{1/3} \sqrt{\log 2k} R_n - + (n k)^{1/4} \sqrt{\log 2k} R_n. 
- \end{align*} - % - If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then - the third-order version of Corollary~\ref{cor:yurinskii_sa_martingale} - applies since - % - \begin{align*} - \pi_3 - &= - \sum_{i=1}^{n} - \sum_{|\kappa| = 3} - \E \Big[ \big| - \E [ X_i^\kappa \mid \cH_{i-1} ] - \big| \Big] - = - \sum_{i=1}^{n} - \sum_{|\kappa| = 3} - \E \Big[ \big| - p(W_i)^\kappa \, - \E [ \varepsilon_i^3 \mid \cH_{i-1} ] - \big| \Big] - = 0, - \end{align*} - % - giving - % - \begin{align*} - \|S - T \|_\infty - &\lesssim_\P - \beta_{\infty,3}^{1/4} (\log 2k)^{3/8} R_n - + \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n - \lesssim_\P - (n k)^{1/4} \sqrt{\log 2k} R_n. - \end{align*} - % - By H{\"o}lder's inequality and with - $\|H^{-1}\|_1 \lesssim k/n$ we have - % - \begin{align*} - \sup_{w \in \cW} - \left| - p(w)^\T H^{-1} S - - p(w)^\T H^{-1} T - \right| - &\leq - \sup_{w \in \cW} - \|p(w)\|_1 - \|H^{-1}\|_1 - \| S - T \|_\infty - \lesssim - n^{-1} k - \| S - T \|_\infty. - \end{align*} - - \proofparagraph{convergence of $\hat H$} - - We have - $\hat H - H = \sum_{i=1}^n \big(p(W_i)p(W_i)^\T - \E\left[ - p(W_i)p(W_i)^\T \right]\big)$. - Observe that $(\hat H - H)_{j l}$ is the sum of - a zero-mean strictly stationary $\alpha$-mixing sequence and so - $\E[(\hat H - H)_{j l}^2] \lesssim n$ by - Lemma~\ref{lem:yurinskii_app_variance_mixing}% - \ref{it:yurinskii_app_variance_mixing_bounded}. - Since the basis - functions satisfy Assumption~3 in \citet{cattaneo2020large}, - $\hat H-H$ has a - bounded number of non-zero entries in each row and so by Jensen's inequality - % - \begin{align*} - \E\left[ - \|\hat H-H\|_1 - \right] - &= - \E\left[ - \max_{1 \leq i \leq k} - \sum_{j=1}^k - \big|(\hat H-H)_{i j}\big| - \right] - \leq - \E\left[ - \sum_{1 \leq i \leq k} - \Bigg( - \sum_{j=1}^k - |(\hat H-H)_{i j}| - \Bigg)^2 - \right]^{\frac{1}{2}} - \lesssim \sqrt{n k}. - \end{align*} - - \proofparagraph{bounding the matrix term} - - Note $\|\hat H^{-1}\|_1 \leq \|H^{-1}\|_1 - + \|\hat H^{-1}\|_1 \|\hat H-H\|_1 \|H^{-1}\|_1$ - so by the previous part, we deduce - % - \begin{align*} - \|\hat H^{-1}\|_1 - \leq - \frac{\|H^{-1}\|_1} - {1 - \|\hat H-H\|_1 \|H^{-1}\|_1} - \lesssim_\P - \frac{k/n} - {1 - \sqrt{n k}\, k/n} - \lesssim_\P - \frac{k}{n} - \end{align*} - % - as $k^3 / n \to 0$. Note that by the martingale structure, since - $p(W_i)$ is bounded and supported on a region with volume at most of the order - $1/k$, and as $W_i$ has a Lebesgue density, - % - \begin{align*} - \Var[T_j] - &= - \Var[S_j] - = - \Var\left[ - \sum_{i=1}^n \varepsilon_i p(W_i)_j - \right] - = - \sum_{i=1}^n - \E\left[ - \sigma_i^2 p(W_i)_j^2 - \right] - \lesssim - \frac{n}{k}. - \end{align*} - % - So by the Gaussian maximal inequality in - Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, - $\|T\|_\infty \lesssim_\P \sqrt{\frac{n \log 2k}{k}}$. - Since $k^3/n \to 0$, - % - \begin{align*} - \sup_{w \in \cW} - \left| - p(w)^\T (\hat H^{-1} - H^{-1}) S - \right| - &\leq - \sup_{w \in \cW} - \|p(w)^\T\|_1 - \|\hat H^{-1}\|_1 - \|\hat H - H\|_1 - \|H^{-1}\|_1 - \|S - T\|_\infty \\ - &\quad+ - \sup_{w \in \cW} - \|p(w)^\T\|_1 - \|\hat H^{-1}\|_1 - \|\hat H - H\|_1 - \|H^{-1}\|_1 - \|T\|_\infty \\ - &\lesssim_\P - \frac{k^2}{n^2} - \sqrt{n k} - \!\left( - n^{1/3} \sqrt{\log 2k} - + (n k)^{1/4} \sqrt{\log 2k} - \right) - \!+ \frac{k^2}{n^2} - \sqrt{n k} - \sqrt{\frac{n \log 2k}{k}} \\ - &\lesssim_\P - \frac{k^2}{n} - \sqrt{\log 2k}. 
- \end{align*} - % - - \proofparagraph{conclusion of the main result} - - By the previous parts, - with $G(w) = p(w)^\T H^{-1} T$, - % - \begin{align*} - &\sup_{w \in \cW} - \left| - \hat\mu(w) - \mu(w) - - p(w)^\T H^{-1} T - \right| \\ - &\quad= - \sup_{w \in \cW} - \left| - p(w)^\T H^{-1} (S - T) - + p(w)^\T (\hat H^{-1} - H^{-1}) S - + \Bias(w) - \right| \\ - &\quad\lesssim_\P - \frac{k}{n} - \|S - T\|_\infty - + \frac{k^2}{n} \sqrt{\log 2k} - + \sup_{w \in \cW} |\Bias(w)| \\ - &\quad\lesssim_\P - \frac{k}{n} - \left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) R_n - + \frac{k^2}{n} \sqrt{\log 2k} - + \sup_{w \in \cW} |\Bias(w)| \\ - &\quad\lesssim_\P - n^{-2/3} k \sqrt{\log 2k} R_n - + n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n - + \frac{k^2}{n} \sqrt{\log 2k} - + \sup_{w \in \cW} |\Bias(w)| \\ - &\quad\lesssim_\P - n^{-2/3} k \sqrt{\log 2k} R_n - + \sup_{w \in \cW} |\Bias(w)| - \end{align*} - % - since $k^3/n \to 0$. - If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then - % - \begin{align*} - \sup_{w \in \cW} - \left| - \hat\mu(w) - \mu(w) - - p(w)^\T H^{-1} T - \right| - &\lesssim_\P - \frac{k}{n} - \|S - T\|_\infty - + \frac{k^2}{n} \sqrt{\log 2k} - + \sup_{w \in \cW} |\Bias(w)| \\ - &\lesssim_\P - n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n - + \sup_{w \in \cW} |\Bias(w)|. - \end{align*} - % - Finally, we verify the variance bounds for the Gaussian process. - With $\sigma^2(w)$ bounded above, - % - \begin{align*} - \Var[G(w)] - &= - p(w)^\T H^{-1} - \Var\left[ \sum_{i=1}^n p(W_i) \varepsilon_i \right] - H^{-1} p(w) \\ - &= - p(w)^\T H^{-1} - \E\left[\sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) \right] - H^{-1} p(w) \\ - &\lesssim - \|p(w)\|_2^2 \|H^{-1}\|_2^2 - \|H\|_2 - \lesssim - k/n. - \end{align*} - % - Similarly, since $\sigma^2(w)$ is bounded away from zero, - % - \begin{align*} - \Var[G(w)] - &\gtrsim - \|p(w)\|_2^2 \|H^{-1}\|_2^2 - \|H^{-1}\|_2^{-1} - \gtrsim - k/n. - \end{align*} - - \proofparagraph{bounding the bias} - - We delegate the task of carefully deriving bounds on the bias to - \citet{cattaneo2020large}, who provide a high-level assumption on the - approximation error in Assumption~4 and then use it to derive bias bounds in - Section~3 of the form $\sup_{w \in \cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$. - This assumption is then verified for B-splines, wavelets, and piecewise - polynomials in their supplemental appendix. - -\end{proof} - -\begin{proof}[Proposition~\ref{pro:yurinskii_series_feasible}] - \proofparagraph{infeasible supremum approximation} - - Provided that the bias is negligible, - for all $s > 0$ we have - % - \begin{align*} - &\sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} - \right| \leq t - \right) - - - \P\left( - \sup_{w \in \cW} - \left| - \frac{G(w)}{\sqrt{\rho(w,w)}} - \right| \leq t - \right) - \right| \\ - &\quad\leq - \sup_{t \in \R} - \P\left( - t \leq - \sup_{w \in \cW} - \left| - \frac{G(w)}{\sqrt{\rho(w,w)}} - \right| - \leq t + s - \right) - + - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat\mu(w)-\mu(w)-G(w)}{\sqrt{\rho(w,w)}} - \right| > s - \right). - \end{align*} - % - By the Gaussian anti-concentration result given as Corollary~2.1 in - \citet{chernozhukov2014anti} applied to a discretization of $\cW$, the first - term is at most $s \sqrt{\log n}$ up to a constant factor, and the second - term converges to zero whenever - $\frac{1}{s} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} \to 0$. 
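  For instance, one concrete (though by no means unique) choice is
  $s = \big( \frac{k^3 (\log k)^3}{n} \big)^{1/12} (\log n)^{-1/4}$,
  for which both terms are of order
  $\big( \frac{k^3 (\log k)^3 (\log n)^3}{n} \big)^{1/12}$,
  and this vanishes whenever $\frac{k^3 (\log n)^6}{n} \to 0$
  since $\log k \lesssim \log n$.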
- Thus a suitable value of $s$ exists whenever $\frac{k^3(\log n)^6}{n} \to 0$. - - \proofparagraph{feasible supremum approximation} - - By \citet[Lemma~3.1]{chernozhukov2013gaussian} and discretization, - with $\rho(w,w') = \E[\hat\rho(w,w')]$, - % - \begin{align*} - &\sup_{t \in \R} - \left| - \P\left( - \sup_{w \in \cW} - \left| - \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} - \right| - \leq t \biggm| \bW, \bY - \right) - - \P\left( - \left| - \frac{G(w)}{\sqrt{\rho(w,w)}} - \right| - \leq t - \right) - \right| \\ - &\quad\lesssim_\P - \sup_{w,w' \in \cW} - \left| - \frac{\hat\rho(w,w')} - {\sqrt{\hat\rho(w,w)\hat\rho(w',w')}} - - \frac{\rho(w,w')} - {\sqrt{\rho(w,w)\rho(w',w')}} - \right|^{1/3} - (\log n)^{2/3} \\ - &\quad\lesssim_\P - \left(\frac n k \right)^{1/3} - \sup_{w,w' \in \cW} |\hat\rho(w,w') - \rho(w,w')|^{1/3} - (\log n)^{2/3} \\ - &\quad\lesssim_\P - \left( \frac{n (\log n)^2}{k} \right)^{1/3} - \sup_{w,w' \in \cW} - \left| - p(w)^\T \hat H^{-1} - \left( - \hat{V}[S] - - \Var[S] - \right) - \hat H^{-1} p(w') - \right|^{1/3} \\ - &\quad\lesssim_\P - \left( \frac{k (\log n)^2}{n} \right)^{1/3} - \left\| - \hat{V}[S] - - \Var[S] - \right\|_2^{1/3}, - \end{align*} - % - and vanishes in probability whenever - $\frac{k (\log n)^2}{n} - \big\| \hat{V}[S] - \Var[S] \big\|_2 \to_\P 0$. - For the plug-in estimator, - % - \begin{align*} - &\left\| - \hat{V}[S] - - \Var[S] - \right\|_2 - = - \left\| - \sum_{i=1}^n - p(W_i) p(W_i^\T) - \hat\sigma^2(W_i) - - n \E\left[ - p(W_i) p(W_i^\T) - \sigma^2(W_i) - \right] - \right\|_2 \\ - &\quad\lesssim_\P - \sup_{w \in \cW} - |\hat{\sigma}^2(w)-\sigma^2(w)| - \, \big\| \hat H \big\|_2 \\ - &\qquad+ - \left\| - \sum_{i=1}^n - p(W_i) p(W_i^\T) - \sigma^2(W_i) - - n \E\left[ - p(W_i) p(W_i^\T) - \sigma^2(W_i) - \right] - \right\|_2 \\ - &\quad\lesssim_\P - \frac{n}{k} - \sup_{w \in \cW} - |\hat{\sigma}^2(w)-\sigma^2(w)| - + \sqrt{n k}, - \end{align*} - % - where the second term is bounded by the same argument - used to bound $\|\hat H - H\|_1$. - Thus, the feasible approximation is valid whenever - $(\log n)^2 \sup_{w \in \cW} - |\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$ - and $\frac{k^3 (\log n)^4}{n} \to 0$. - The validity of the uniform confidence band follows immediately. - % -\end{proof} - -\begin{proof}[Proposition~\ref{pro:yurinskii_local_poly}] - - We apply Proposition~\ref{pro:yurinskii_emp_proc} - with the metric $d(f_w, f_{w'}) = \|w-w'\|_2$ - and the function class - % - \begin{align*} - \cF - &= - \left\{ - (W_i, \varepsilon_i) \mapsto - e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - \varepsilon_i - :\ w \in \cW - \right\}, - \end{align*} - % - with $\psi$ chosen as a suitable Bernstein Orlicz function. - - \proofparagraph{bounding $H(w)^{-1}$} - - Recall that - $H(w) = \sum_{i=1}^n \E[K_h(W_i-w) p_h(W_i-w)p_h(W_i-w)^\T]$ - and let $a(w) \in \R^k$ with $\|a(w)\|_2 = 1$. - Since the density of $W_i$ is bounded away from zero on $\cW$, - % - \begin{align*} - a(w)^\T H(w) a(w) - &= - n \E\left[ - \big( a(w)^\T p_h(W_i-w) \big)^2 - K_h(W_i-w) - \right] \\ - &\gtrsim - n \int_\cW - \big( a(w)^\T p_h(u-w) \big)^2 - K_h(u-w) - \diff{u} - \gtrsim - n \int_{\frac{\cW-w}{h}} - \big( a(w)^\T p(u) \big)^2 - K(u) - \diff{u}. - \end{align*} - % - This is continuous in $a(w)$ on the compact set - $\|a(w)\|_2 = 1$ - and $p(u)$ forms a polynomial basis so - $a(w)^\T p(u)$ has finitely many zeroes. 
- Since $K(u)$ is compactly supported - and $h \to 0$, - the above integral is eventually strictly positive - for all $x \in \cW$, - and hence is bounded below uniformly in $w \in \cW$ - by a positive constant. - Therefore - $\sup_{w \in \cW} \|H(w)^{-1}\|_2 \lesssim 1/n$. - - \proofparagraph{bounding $\beta_\delta$} - - Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$ - with cardinality $|\cF_\delta| \asymp \delta^{-m}$ - and let - $\cF_\delta(W_i, \varepsilon_i) - = \big(f(W_i, \varepsilon_i) : f\in \cF_\delta\big)$. - Define the truncated errors - $\tilde\varepsilon_i = - \varepsilon_i\I\{-a \log n \leq \varepsilon_i \leq b \log n\}$ - and note that - $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ - implies that - $\P(\exists i: \tilde\varepsilon_i \neq \varepsilon_i) - \lesssim n^{1-(a \vee b)/C_\varepsilon}$. - Hence, by choosing $a$ and $b$ large enough, - with high probability, we can replace all - $\varepsilon_i$ by $\tilde\varepsilon_i$. - Further, it is always possible to increase either $a$ or $b$ - along with some randomization to ensure that - $\E[\tilde\varepsilon_i] = 0$. - Since $K$ is bounded and compactly supported, - $W_i$ has a bounded density and - $|\tilde\varepsilon_i| \lesssim \log n$, - % - \begin{align*} - \bigvvvert - f(W_i, \tilde\varepsilon_i) - \bigvvvert_2 - &= - \E\left[ - \left| - e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - \tilde\varepsilon_i - \right|^2 - \right]^{1/2} \\ - &\leq - \E\left[ - \|H(w)^{-1}\|_2^2 - K_h(W_i-w)^2 - \|p_h(W_i-w)\|_2^2 - \sigma^2(W_i) - \right]^{1/2} \\ - &\lesssim - n^{-1} - \E\left[ - K_h(W_i-w)^2 - \right]^{1/2} - \lesssim - n^{-1} - h^{-m / 2}, \\ - \bigvvvert - f(W_i, \tilde\varepsilon_i) - \bigvvvert_\infty - &\leq - \bigvvvert - \|H(w)^{-1}\|_2 - K_h(W_i-w) - \|p_h(W_i-w)\|_2 - |\tilde\varepsilon_i| - \bigvvvert_\infty \\ - &\lesssim - n^{-1} - \bigvvvert - K_h(W_i-w) - \bigvvvert_\infty - \log n - \lesssim - n^{-1} - h^{-m} - \log n. - \end{align*} - % - Therefore - % - \begin{align*} - \E\left[ - \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 - \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty - \right] - &\leq - \!\sum_{f\in\cF_\delta} - \!\bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2^2 - \max_{f\in\cF_\delta} - \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty - \!\lesssim - n^{-3} \delta^{-m} h^{-2m} \log n. - \end{align*} - % - Let - $V_i(\cF_\delta) = - \E\big[\cF_\delta(W_i, \tilde\varepsilon_i) - \cF_\delta(W_i, \tilde\varepsilon_i)^\T - \mid \cH_{i-1}\big]$ - and $Z_i \sim \cN(0, I_d)$ be i.i.d.\ and - independent of $\cH_n$. - Note that - $V_i(f,f) = \E[f(W_i, \tilde\varepsilon_i)^2 \mid W_i] - \lesssim n^{-2} h^{-2m}$ - and - $\E[V_i(f,f)] = \E[f(W_i, \tilde\varepsilon_i)^2] - \lesssim n^{-2} h^{-m}$. - Thus by Lemma~\ref{lem:yurinskii_app_gaussian_useful}, - % - \begin{align*} - \E\left[ - \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 - \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty - \right] - &= - \E\left[ - \E\left[ - \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 - \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty - \mid \cH_n - \right] - \right] \\ - &\leq - 4 \sqrt{\log 2|\cF_\delta|} - \,\E\Bigg[ - \max_{f \in \cF_\delta} \sqrt{V_i(f,f)} - \sum_{f \in \cF_\delta} V_i(f,f) - \Bigg] \\ - &\lesssim - n^{-3} - h^{-2m} - \delta^{-m} - \sqrt{\log(1/\delta)}. 
- \end{align*} - % - Thus since $\log(1/\delta) \asymp \log(1/h) \asymp\log n$, - % - \begin{align*} - \beta_\delta - &= - \sum_{i=1}^n - \E\left[ - \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 - \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty - + \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 - \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty - \right] - \lesssim - \frac{\log n} - {n^2 h^{2m} \delta^m}. - \end{align*} - - \proofparagraph{bounding $\Omega_\delta$} - - Let $C_K>0$ be the radius of a $\ell^2$-ball - containing the support of $K$ - and note that - % - \begin{align*} - \left| - V_i(f,f') - \right| - &= - \Big| - \E\Big[ - e_1^\T H(w)^{-1} - p_h(W_i-w) - e_1^\T H(w')^{-1} - p_h(W_i-w') \\ - &\qquad\times - K_h(W_i-w) - K_h(W_i-w') - \tilde\varepsilon_i^2 - \Bigm| \cH_{i-1} - \Big] - \Big| \\ - &\lesssim - n^{-2} - K_h(W_i-w) - K_h(W_i-w') \\ - &\lesssim - n^{-2} - h^{-m} - K_h(W_i-w) - \I\{\|w-w'\|_2 \leq 2 C_K h\}. - \end{align*} - % - Since $W_i$ are $\alpha$-mixing - with $\alpha(j) < e^{-2j / C_\alpha}$, - Lemma~\ref{lem:yurinskii_app_variance_mixing}% - \ref{it:yurinskii_app_variance_mixing_exponential} - with $r=3$ gives - % - \begin{align*} - &\Var\left[ - \sum_{i=1}^n V_i(f,f') - \right] \\ - &\quad\lesssim - \sum_{i=1}^n - \E\left[ - |V_i(f,f')|^3 - \right] ^{2/3} - \lesssim - n^{-3} h^{-2m} - \E\left[ - K_h(W_i-w)^3 - \right] ^{2/3} - \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ - &\quad\lesssim - n^{-3} h^{-2m} - (h^{-2m})^{2/3} - \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ - &\quad\lesssim - n^{-3} h^{-10m/3} - \I\{\|w-w'\|_2 \leq 2 C_K h\}. - \end{align*} - % - Therefore, by Jensen's inequality, - % - \begin{align*} - \E\big[ \|\Omega_\delta\|_2 \big] - &\leq - \E\big[ \|\Omega_\delta\|_\rF \big] - \leq - \E\Bigg[ - \sum_{f,f' \in \cF_\delta} - (\Omega_\delta)_{f,f'}^2 - \Bigg]^{1/2} - \leq - \Bigg( - \sum_{f,f' \in \cF_\delta} - \Var\left[ - \sum_{i=1}^n V_i(f,f') - \right] - \Bigg)^{1/2} \\ - &\lesssim - n^{-3/2} h^{-5m/3} - \Bigg( - \sum_{f,f' \in \cF_\delta} - \I\{\|w-w'\|_2 \leq 2 C_K h\} - \Bigg)^{1/2} \\ - &\lesssim - n^{-3/2} h^{-5m/3} - \big(h^{m} \delta^{-2m} \big)^{1/2} - \lesssim - n^{-3/2} - h^{-7m/6} - \delta^{-m}. - \end{align*} - % - Note that we could have used - $\|\cdot\|_1$ rather than $\|\cdot\|_\rF$, - but this term is negligible either way. - - \proofparagraph{regularity of the stochastic processes} - - For each $f, f' \in \cF$, - define the mean-zero and $\alpha$-mixing random variables - % - \begin{align*} - u_i(f,f') - &= - e_1^\T - \big( - H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') - \big) - \tilde\varepsilon_i. 
- \end{align*} - % - Note that for all $1 \leq j \leq k$, - by the Lipschitz property of the kernel and monomials, - % - \begin{align*} - &\left| - K_h(W_i-w) - K_h(W_i-w') - \right| \\ - &\quad\lesssim - h^{-m-1} - \|w-w'\|_2 - \big( - \I\{\|W_i-w\| \leq C_K h\} - + \I\{\|W_i-w'\| \leq C_K h\} - \big), \\ - &\left| - p_h(W_i-w)_j - p_h(W_i-w')_j - \right| - \lesssim - h^{-1} - \|w-w'\|_2, - \end{align*} - % - to deduce that for any $1 \leq j,l \leq k$, - % - \begin{align*} - \big| H(w)_{j l} - H(w')_{j l} \big| - &= - \big| - n \E\big[ - K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \\ - &\qquad- - K_h(W_i-w') p_h(W_i-w')_j p_h(W_i-w')_l - \big] - \big| \\ - &\leq - n\E\left[ - \left| - K_h(W_i-w) - K_h(W_i-w') - \right| - \left| - p_h(W_i-w)_j - p_h(W_i-w)_l - \right| - \right] \\ - &\quad+ - n\E\left[ - \left| - p_h(W_i-w)_j - p_h(W_i-w')_j - \right| - \left| - K_h(W_i-w') - p_h(W_i-w)_l - \right| - \right] \\ - &\quad+ - n\E\left[ - \left| - p_h(W_i-w)_l - p_h(W_i-w')_l - \right| - \left| - K_h(W_i-w') - p_h(W_i-w')_j - \right| - \right] \\ - &\lesssim - n h^{-1}\|w-w'\|_2. - \end{align*} - % - Therefore, as the dimension of the matrix $H(w)$ is fixed, - % - \begin{align*} - \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 - &\leq - \big\| H(w)^{-1}\big\|_2 - \big\| H(w')^{-1}\big\|_2 - \big\| H(w) - H(w') \big\|_2 - \lesssim - \frac{\|w-w'\|_2}{n h}. - \end{align*} - % - Hence - % - \begin{align*} - \big| u_i(f,f') \big| - &\leq - \big\| - H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') - \tilde\varepsilon_i - \big\|_2 \\ - &\leq - \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 - \big\| K_h(W_i-w) p_h(W_i-w) - \tilde\varepsilon_i - \big\|_2 \\ - &\quad+ - \big| K_h(W_i-w) - K_h(W_i-w') \big| - \big\| H(w')^{-1} p_h(W_i-w) - \tilde\varepsilon_i - \big\|_2 \\ - &\quad+ - \big\| p_h(W_i-w) - p_h(W_i-w') \big\|_2 - \big\| H(w')^{-1} K_h(W_i-w') - \tilde\varepsilon_i \big\|_2 \\ - &\lesssim - \frac{\|w-w'\|_2}{n h} - \big| K_h(W_i-w) \tilde\varepsilon_i \big| - + \frac{1}{n} - \big| K_h(W_i-w) - K_h(W_i-w') \big| - \,|\tilde\varepsilon_i| \\ - &\lesssim - \frac{\|w-w'\|_2 \log n}{n h^{m+1}}, - \end{align*} - % - and from the penultimate line, we also deduce that - % - \begin{align*} - \Var[u_i(f,f')] - &\lesssim - \frac{\|w-w'\|_2^2}{n^2h^2} - \E\left[ - K_h(W_i-w)^2 \sigma^2(X_i) - \right] \\ - &\quad+ - \frac{1}{n^2} - \E\left[ - \big( K_h(W_i-w) - K_h(W_i-w') \big)^2 - \sigma^2(X_i) - \right] - \lesssim - \frac{\|w-w'\|_2^2}{n^2h^{m+2}}. - \end{align*} - % - Further, $\E[u_i(f,f') u_j(f,f')] = 0$ for $i \neq j$ so - by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% - \ref{it:yurinskii_app_exponential_mixing_bernstein}, - for a constant $C_1>0$, - % - \begin{align*} - \P\left( - \Big| \sum_{i=1}^n u_i(f,f') \Big| - \geq \frac{C_1 \|w-w'\|_2}{\sqrt n h^{m/2+1}} - \left( - \sqrt{t} - + \sqrt{\frac{(\log n)^2}{n h^m}} \sqrt t - + \sqrt{\frac{(\log n)^6}{n h^m}} t - \right) - \right) - &\leq - C_1 e^{-t}. - \end{align*} - % - Therefore, adjusting the constant if necessary - and since $n h^{m} \gtrsim (\log n)^7$, - % - \begin{align*} - \P\left( - \Big| \sum_{i=1}^n u_i(f,f') \Big| - \geq - \frac{C_1 \|w-w'\|_2}{\sqrt{n} h^{m/2+1}} - \left( - \sqrt{t} + \frac{t}{\sqrt{\log n}} - \right) - \right) - &\leq - C_1 e^{-t}. 
- \end{align*} - % - \Citet[Lemma~2]{van2013bernstein} with - $\psi(x) = - \exp\Big(\big(\sqrt{1+2 x / \sqrt{\log n}}-1 \big)^2 - \log n \Big)-1$ - now shows that - % - \begin{align*} - \Bigvvvert \sum_{i=1}^n u_i(f,f') \Bigvvvert_\psi - &\lesssim - \frac{\|w-w'\|_2}{\sqrt{n} h^{m/2+1}} - \end{align*} - % - so we take $L = \frac{1}{\sqrt{n} h^{m/2+1}}$. - Noting - $\psi^{-1}(t) = \sqrt{\log(1+t)} + \frac{\log(1+t)}{2\sqrt{\log n}}$ - and $N_\delta \lesssim \delta^{-m}$, - % - \begin{align*} - J_\psi(\delta) - &= - \int_0^\delta - \psi^{-1}\big( N_\varepsilon \big) - \diff{\varepsilon} - + \delta - \psi^{-1} \big( N_\delta \big) - \lesssim - \frac{\delta \log(1/\delta)}{\sqrt{\log n}} - + \delta \sqrt{\log(1/\delta)} - \lesssim - \delta \sqrt{\log n}, \\ - J_2(\delta) - &= - \int_0^\delta - \sqrt{\log N_\varepsilon} - \diff{\varepsilon} - \lesssim - \delta \sqrt{\log(1/\delta)} - \lesssim - \delta \sqrt{\log n}. - \end{align*} - - \proofparagraph{strong approximation} - - Recalling that - $\tilde\varepsilon_i = \varepsilon_i$ - for all $i$ with high probability, - by Proposition~\ref{pro:yurinskii_emp_proc}, - for all $t, \eta > 0$ there exists a - zero-mean Gaussian process $T(w)$ satisfying - % - \begin{align*} - \E\left[ - \left(\sum_{i=1}^n f_w(W_i, \varepsilon_i)\right) - \left(\sum_{i=1}^n f_{w'}(W_i, \varepsilon_i)\right) - \right] - &= \E\big[ T(w) T(w') - \big] - \end{align*} - % - for all $w, w' \in \cW$ and - % - \begin{align*} - &\P\left( - \sup_{w \in \cW} - \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - - T(w) \right| - \geq C_\psi(t + \eta) - \right) \\ - &\quad\leq - C_\psi - \inf_{\delta > 0} - \inf_{\cF_\delta} - \Bigg\{ - \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } - + \left(\frac{\sqrt{\log 2 |\cF_\delta|} - \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} \\ - &\qquad+ - \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} - + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) - \Bigg\} \\ - &\quad\leq - C_\psi - \Bigg\{ - \frac{ - \left(\frac{\log n} {n^2 h^{2m} \delta^{m}} \right)^{1/3} - (\log n)^{1/3}}{\eta } - + \left(\frac{\sqrt{\log n} - \sqrt{n^{-3/2} h^{-7m/6} \delta^{-m}} - }{\eta }\right)^{2/3} \\ - &\qquad+ - \psi\left(\frac{t}{\frac{1}{\sqrt{n} h^{m/2+1}} - J_\psi(\delta)}\right)^{-1} - + \exp\left(\frac{-t^2}{ - \left( \frac{1}{\sqrt{n} h^{m/2+1}} \right)^2 - J_2(\delta)^2}\right) - \Bigg\} \\ - &\quad\leq - C_\psi - \Bigg\{ - \frac{ - (\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3} \eta} - + \left(\frac{ - n^{-3/4} h^{-7m/12} \delta^{-m/2} \sqrt{\log n}} - {\eta }\right)^{2/3} \\ - &\qquad+ - \psi\left(\frac{t\sqrt{n} h^{m/2+1}} - {\delta \sqrt{\log n}}\right)^{-1} - + \exp\left(\frac{-t^2n h^{m+2}} - {\delta^2 \log n}\right) - \Bigg\}. - \end{align*} - % - Noting $\psi(x) \geq e^{x^2/4}$ for $x \leq 4 \sqrt{\log n}$, - any $R_n \to \infty$ gives the probability bound - % - \begin{align*} - \sup_{w \in \cW} - \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - - T(w) \right| - &\lesssim_\P - \frac{(\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3}} R_n - + \frac{\sqrt{\log n}}{n^{3/4} h^{7m/12} \delta^{m/2}} R_n - + \frac{\delta \sqrt{\log n}} {\sqrt{n} h^{m/2+1}}. 
- \end{align*} - % - Optimizing over $\delta$ gives - $\delta \asymp \left(\frac{\log n}{n h^{m-6}}\right)^{\frac{1}{2m+6}} - = h \left( \frac{\log n}{n h^{3m}} \right)^{\frac{1}{2m+6}}$ - and so - % - \begin{align*} - \sup_{w \in \cW} - \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - - T(w) \right| - &\lesssim_\P - \left( - \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} - \right)^{\frac{1}{2m+6}} R_n. - \end{align*} - - \proofparagraph{convergence of $\hat H(w)$} - - For $1 \leq j,l \leq k$ - define the zero-mean random variables - % - \begin{align*} - u_{i j l}(w) - &= - K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l - - \E\big[K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \big] - \end{align*} - % - and note that - $|u_{i j l}(w)| \lesssim h^{-m}$. - By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% - \ref{it:yurinskii_app_exponential_mixing_bounded} - for a constant $C_2 > 0$ and all $t > 0$, - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n - u_{i j l}(w) - \right| - > C_2 h^{-m} \big( \sqrt{n t} - + (\log n)(\log \log n) t \big) - \right) - &\leq - C_2 e^{-t}. - \end{align*} - % - Further, note that by Lipschitz properties, - % - \begin{align*} - \left| - \sum_{i=1}^n u_{i j l}(w) - - \sum_{i=1}^n u_{i j l}(w') - \right| - &\lesssim - h^{-m-1} \|w-w'\|_2 - \end{align*} - % - so there is a $\delta$-cover of $(\cW, \|\cdot\|_2)$ - with size at most $n^a \delta^{-a}$ for some $a > 0$. - Adjusting $C_2$, - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \left| - \sum_{i=1}^n - u_{i j l}(w) - \right| - > C_2 h^{-m} \big( \sqrt{n t} - + (\log n)(\log \log n) t \big) - + C_2 h^{-m-1} \delta - \right) - &\leq - C_2 n^a \delta^{-a} - e^{-t} - \end{align*} - % - and hence - % - \begin{align*} - \sup_{w \in \cW} - \left| - \sum_{i=1}^n - u_{i j l}(w) - \right| - &\lesssim_\P - h^{-m} \sqrt{n \log n} - + h^{-m} (\log n)^3 - \lesssim_\P - \sqrt{\frac{n \log n}{h^{2m}}}. - \end{align*} - % - Therefore - % - \begin{align*} - \sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 - &\lesssim_\P - \sqrt{\frac{n \log n}{h^{2m}}}. - \end{align*} - - \proofparagraph{bounding the matrix term} - - Firstly, note that - since $\sqrt{\frac{\log n}{n h^{2m}}} \to 0$, - we have that uniformly in $w \in \cW$ - % - \begin{align*} - \|\hat H(w)^{-1}\|_2 - \leq - \frac{\|H(w)^{-1}\|_2} - {1 - \|\hat H(w)-H(w)\|_2 \|H(w)^{-1}\|_2} - &\lesssim_\P - \frac{1/n} - {1 - \sqrt{\frac{n \log n}{h^{2m}}} \frac{1}{n}} - \lesssim_\P - \frac{1}{n}. - \end{align*} - % - Therefore - % - \begin{align*} - &\sup_{w \in \cW} - \big| - e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) - S(w) - \big| - \leq - \sup_{w \in \cW} - \big\|\hat H(w)^{-1} - H(w)^{-1}\big\|_2 - \|S(w)\|_2 \\ - &\quad\leq - \sup_{w \in \cW} - \big\|\hat H(w)^{-1}\big\|_2 - \big\|H(w)^{-1}\big\|_2 - \big\|\hat H(w) - H(w)\big\|_2 - \|S(w)\|_2 - \lesssim_\P - \sqrt{\frac{\log n}{n^3 h^{2m}}} - \sup_{w \in \cW} - \|S(w)\|_2. - \end{align*} - % - Now for $1 \leq j \leq k$ write - $u_{i j}(w) = K_h(W_i-w) p_h(W_i-w)_j \tilde \varepsilon_i$ - so that $S(w)_j = \sum_{i=1}^n u_{i j}(w)$ with high probability. - Note that $u_{i j}(w)$ are zero-mean with - $\Cov[u_{i j}(w), u_{i' j}(w)] = 0$ for $ i \neq i'$. - Also $|u_{i j}(w)| \lesssim h^{-m} \log n$ - and $\Var[u_{i j}(w)] \lesssim h^{-m}$. 
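  Both bounds are elementary: $|\tilde\varepsilon_i| \lesssim \log n$,
  $\E[\tilde\varepsilon_i^2 \mid W_i] \lesssim 1$,
  $|p_h(W_i-w)_j| \lesssim 1$ on the support of $K_h(W_i-w)$,
  $\sup_u |K_h(u)| \lesssim h^{-m}$,
  and, since $W_i$ has a bounded density,
  with the usual scaling $K_h(\cdot) = h^{-m} K(\cdot / h)$,
  %
  \begin{align*}
    \E\left[ K_h(W_i-w)^2 \right]
    = h^{-2m}
    \E\left[ K\left( \frac{W_i-w}{h} \right)^{2} \right]
    \lesssim h^{-m}.
  \end{align*}
  %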
- By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% - \ref{it:yurinskii_app_exponential_mixing_bernstein} - for a constant $C_3>0$, - % - \begin{align*} - \P\left( - \Big| \sum_{i=1}^n u_{i j}(w) \Big| - \geq C_3 \big( (h^{-m/2} \sqrt n + h^{-m} \log n) \sqrt t - + h^{-m} (\log n)^3 t \big) - \right) - &\leq - C_3 e^{-t}, \\ - \P\left( - \Big| \sum_{i=1}^n u_{i j}(w) \Big| - > - C_3 \left( - \sqrt{\frac{tn}{h^{m}}} - + \frac{t(\log n)^3}{h^{m}} - \right) - \right) - &\leq - C_3 e^{-t}, - \end{align*} - % - where we used $n h^{m} \gtrsim (\log n)^2$ - and adjusted the constant if necessary. - As before, - $u_{i j}(w)$ is Lipschitz in $w$ with a constant which is at most - polynomial in $n$, - so for some $a>0$ - % - \begin{align*} - \P\left( - \sup_{w \in \cW} - \Big| \sum_{i=1}^n u_{i j}(w) \Big| - > - C_3 \left( - \sqrt{\frac{tn}{h^{m}}} - + \frac{t(\log n)^3}{h^{m}} - \right) - \right) - &\leq - C_3 n^a e^{-t}, \\ - \sup_{w \in \cW} - \|S(w)\|_2 - \lesssim_\P - \sqrt{\frac{n \log n}{h^{m}}} - + \frac{(\log n)^4}{h^{m}} - &\lesssim_\P - \sqrt{\frac{n \log n}{h^{m}}} - \end{align*} - % - as $n h^m \gtrsim (\log n)^7$. - Finally, - % - \begin{align*} - \sup_{w \in \cW} - \big| - e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) - S(w) - \big| - &\lesssim_\P - \sqrt{\frac{\log n}{n^3 h^{2m}}} - \sqrt{\frac{n \log n}{h^{m}}} - \lesssim_\P - \frac{\log n}{\sqrt{n^2 h^{3m}}}. - \end{align*} - - \proofparagraph{bounding the bias} - - Since $\mu \in \cC^\gamma$, we have, by the multivariate version of Taylor's - theorem, - % - \begin{align*} - \mu(W_i) - &= - \sum_{|\kappa|=0}^{\gamma-1} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w) - (W_i-w)^\kappa - + \sum_{|\kappa|=\gamma} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w') - (W_i-w)^\kappa - \end{align*} - % - for some $w'$ on the line segment connecting - $w$ and $W_i$. - Now since $p_h(W_i-w)_1 = 1$, - % - \begin{align*} - &e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(w) \\ - &\quad= - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T e_1 \mu(w) - = e_1^\T e_1 \mu(w) = \mu(w). - \end{align*} - % - Therefore - % - \begin{align*} - \Bias(w) - &= - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - - \mu(w) \\ - &= - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \\ - &\quad\times - \Bigg( - \sum_{|\kappa|=0}^{\gamma-1} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w) - (W_i-w)^\kappa - + \sum_{|\kappa|=\gamma} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w') - (W_i-w)^\kappa - - \mu(w) - \Bigg) \\ - &= - \sum_{|\kappa|=1}^{\gamma-1} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w) - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - (W_i-w)^\kappa \\ - &\quad+ - \sum_{|\kappa|=\gamma} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w') - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - (W_i-w)^\kappa \\ - &= - \sum_{|\kappa|=\gamma} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w') - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - (W_i-w)^\kappa, - \end{align*} - % - where we used that - $p_h(W_i-w)$ is a vector containing monomials - in $W_i-w$ of order up to $\gamma$, so - $e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - (W_i-w)^\kappa = 0$ - whenever $1 \leq |\kappa| \leq \gamma$. 
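  To see this identity, note that for $1 \leq |\kappa| \leq \gamma$ the
  monomial $(W_i-w)^\kappa$ equals $c_\kappa \, p_h(W_i-w)_{j_\kappa}$ for
  some entry $j_\kappa \neq 1$ of the basis and a deterministic scalar
  $c_\kappa$ (a power of $h$ under the scaling of $p_h$), so
  %
  \begin{align*}
    e_1^\T \hat H(w)^{-1}
    \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w)
    (W_i-w)^\kappa
    = c_\kappa \, e_1^\T \hat H(w)^{-1} \hat H(w) e_{j_\kappa}
    = c_\kappa \, e_1^\T e_{j_\kappa}
    = 0.
  \end{align*}
  %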
- Finally, - % - \begin{align*} - \sup_{w\in\cW} - |\Bias(w)| - &= - \sup_{w\in\cW} - \Bigg| - \sum_{|\kappa|=\gamma} - \frac{1}{\kappa!} - \partial^{\kappa} \mu(w') - e_1^\T \hat H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - (W_i-w)^\kappa - \Bigg| \\ - &\lesssim_\P - \sup_{w\in\cW} - \max_{|\kappa| = \gamma} - \left| - \partial^{\kappa} \mu(w') - \right| - \|\hat H(w)^{-1}\|_2 - \Bigg\| - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - \Bigg\|_2 - h^\gamma \\ - &\lesssim_\P - \frac{h^\gamma}{n} - \sup_{w\in\cW} - \Bigg\| - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) - \Bigg\|_2. - \end{align*} - % - Write - $\tilde u_{i j}(w) = K_h(W_i-w)p_h(W_i-w)_j$ - and note $|\tilde u_{i j}(w)| \lesssim h^{-m}$ - and $\E[\tilde u_{i j}(w)] \lesssim 1$, so - % - \begin{align*} - \P\left( - \left| - \sum_{i=1}^n \tilde u_{i j}(w) - - \E\left[ - \sum_{i=1}^n \tilde u_{i j}(w) - \right] - \right| - > C_4 h^{-m} \big( \sqrt{n t} - + (\log n)(\log \log n) t \big) - \right) - &\leq - C_4 e^{-t} - \end{align*} - % - by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% - \ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_4$, - By Lipschitz properties, this implies - % - \begin{align*} - \sup_{w \in \cW} - \left| - \sum_{i=1}^n \tilde u_{i j}(w) - \right| - &\lesssim_\P - n - \left( - 1 + \sqrt{\frac{\log n}{n h^{2m}}} - \right) - \lesssim_\P - n. - \end{align*} - % - Therefore - $\sup_{w\in\cW} |\Bias(w)| - \lesssim_\P n h^\gamma / n - \lesssim_\P h^\gamma$. - - \proofparagraph{conclusion} - - By the previous parts, - % - \begin{align*} - \sup_{w \in \cW} - \left|\hat \mu(w) - \mu(w) - T(w) \right| - &\leq - \sup_{w \in \cW} - \left|e_1^\T H(w)^{-1} S(w) - T(w) \right| \\ - &\quad+ - \sup_{w \in \cW} - \left| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \right| - + \sup_{w \in \cW} - |\Bias(w)| \\ - &\lesssim_\P - \left( - \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} - \right)^{\frac{1}{2m+6}} R_n - + \frac{\log n}{\sqrt{n^2 h^{3m}}} - + h^\gamma \\ - &\lesssim_\P - \frac{R_n}{\sqrt{n h^m}} - \left( - \frac{(\log n)^{m+4}}{n h^{3m}} - \right)^{\frac{1}{2m+6}} - + h^\gamma, - \end{align*} - % - where the last inequality follows because - $n h^{3m} \to \infty$ - and $\frac{1}{2m+6} \leq \frac{1}{2}$. - Finally, we verify the upper and lower bounds - on the variance of the Gaussian process. - Since the spectrum of $H(w)^{-1}$ - is bounded above and below by $1/n$, - % - \begin{align*} - \Var[T(w)] - &= - \Var\left[ - e_1^\T H(w)^{-1} - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i - \right] \\ - &= - e_1^\T H(w)^{-1} - \Var\left[ - \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i - \right] - H(w)^{-1} e_1^\T \\ - &\lesssim - \|H(w)^{-1}\|_2^2 - \max_{1 \leq j \leq k} - \sum_{i=1}^n - \Var\big[ - K_h(W_i-w) p_h(W_i-w)_j \sigma(W_i) - \big] \\ - &\lesssim - \frac{1}{n^2} n - \frac{1}{h^m} - \lesssim - \frac{1}{n h^m}. - \end{align*} - % - Similarly, - $\Var[T(w)] \gtrsim \frac{1}{n h^m}$ - by the same argument used to bound eigenvalues of - $H(w)^{-1}$. - % -\end{proof} - -\section{High-dimensional central limit theorems for martingales}% -\label{sec:yurinskii_app_high_dim_clt} - -We present an application of our main results to -high-dimensional central limit theorems for martingales. Our main -contribution here is the generality of our results, which are broadly -applicable to martingale data and impose minimal extra assumptions. 
In exchange -for the scope and breadth of our results, we naturally do not necessarily -achieve state-of-the-art distributional approximation errors in certain special -cases, such as with independent data or when restricting the class of sets over -which the central limit theorem must hold. Extensions of our high-dimensional -central limit theorem results to mixingales and other approximate martingales, -along with third-order refinements and Gaussian mixture target distributions, -are possible through methods akin to those used to establish our main results -in Section~\ref{sec:yurinskii_main_results}, but we omit these for succinctness. - -Our approach to deriving a high-dimensional martingale central limit theorem -proceeds as follows. Firstly, the upcoming -Proposition~\ref{pro:yurinskii_app_clt} uses our -main result on martingale coupling -(Corollary~\ref{cor:yurinskii_sa_martingale}) to -reduce the problem to that of providing anti-concentration results for -high-dimensional Gaussian vectors. We then demonstrate the utility of this -reduction by employing a few such anti-concentration methods from the existing -literature. Proposition~\ref{pro:yurinskii_app_bootstrap} gives a feasible -implementation via -the Gaussian multiplier bootstrap, enabling valid -resampling-based inference using -the resulting conditional Gaussian distribution. Finally, in -Section~\ref{sec:yurinskii_app_lp} we provide an example application: -distributional -approximation for $\ell^p$-norms of high-dimensional martingale vectors -in Kolmogorov--Smirnov distance, relying on some recent results -concerning Gaussian perimetric inequalities -\citep{nazarov2003maximal,kozbur2021dimension, -giessing2023anti,chernozhukov2017detailed}. - -We begin this section with some notation. Assume the setup of -Corollary~\ref{cor:yurinskii_sa_martingale} and suppose $\Sigma$ is -non-random. Let $\cA$ be a class of measurable subsets of -$\R^d$ and take $T \sim \cN(0, \Sigma)$. -For $\eta>0$ and $p \in [1, \infty]$ define the Gaussian perimetric quantity -% -\begin{align*} - \Delta_p(\cA, \eta) - &= - \sup_{A\in \cA} - \big\{\P(T\in A_p^\eta\setminus A) - \vee \P(T\in A \setminus A_p^{-\eta})\big\}, -\end{align*} -% -where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$, -$A_p^{-\eta} = \R^d \setminus (\R^d \setminus A)_p^\eta$, -and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$. -Using this perimetric term allows us to convert coupling results -to central limit theorems as follows. -Denote by $\Gamma_p(\eta)$ the rate of strong approximation attained in -Corollary~\ref{cor:yurinskii_sa_martingale}: -% -\begin{align*} - \Gamma_p(\eta) - &= - 24 \left( - \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} - \right)^{1/3} - + 17 \left( - \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} - \right)^{1/3}. -\end{align*} - -\begin{proposition}[High-dimensional central limit theorem for martingales]% - \label{pro:yurinskii_app_clt} - - Take the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, - and $\Sigma$ non-random. - For a class $\cA$ of measurable sets in $\R^d$, - % - \begin{equation}% - \label{eq:yurinskii_app_high_dim_clt} - \sup_{A\in \cA} - \big|\P(S\in A) -\P(T\in A)\big| - \leq \inf_{p \in [1, \infty]} \inf_{\eta>0} - \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. - \end{equation} -\end{proposition} - -\begin{proof}[Proposition~\ref{pro:yurinskii_app_clt}] - - This follows from Strassen's theorem - (Lemma~\ref{lem:yurinskii_app_strassen}), but we - provide a proof for completeness. 
- % - \begin{align*} - \P(S \in A) - &\leq - \P(T \in A) - + \P(T \in A_p^\eta \setminus A) - + \P(\|S - T\| > \eta) - \end{align*} - % - and applying this to $\R^d \setminus A$ gives - % - \begin{align*} - \P(S\in A) - &= - 1 - \P(S\in \R^d \setminus A) \\ - &\geq - 1 - \P(T \in \R^d \setminus A) - - \P(T \in (\R^d \setminus A)_p^\eta \setminus (\R^d \setminus A)) - - \P(\|S - T\| > \eta) \\ - &= - \P(T \in A) - - \P(T \in A \setminus A_p^{-\eta}) - - \P(\|S - T\| > \eta). - \end{align*} - % - Since this holds for all $p \in [1, \infty]$, - % - \begin{align*} - \sup_{A\in \cA} - \big|\P(S\in A) -\P(T\in A)\big| - &\leq - \sup_{A \in \cA} - \big\{\P(T \in A_p^\eta\setminus A) - \vee \P(T \in A \setminus A_p^{-\eta})\big\} - + \P(\|S - T\| > \eta) \\ - &\leq - \inf_{p \in [1, \infty]} \inf_{\eta>0} - \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. - \end{align*} - % -\end{proof} - -The term $\Delta_p(\cA, \eta)$ -in \eqref{eq:yurinskii_app_high_dim_clt} is a Gaussian anti-concentration -quantity -so it depends on the law of $S$ only through the covariance matrix $\Sigma$. -A few results are available in the literature -for bounding this term. -For instance, with -$\cA = \cC = \{A \subseteq \R^d \text{ is convex}\}$, -\citet{nazarov2003maximal} showed -% -\begin{equation}% - \label{eq:yurinskii_app_convex_anticonc} - \Delta_2(\cC, \eta) - \asymp - \eta\sqrt{\|\Sigma^{-1}\|_{\rF}}, -\end{equation} -% -whenever $\Sigma$ is invertible. -Proposition~\ref{pro:yurinskii_app_clt} with $p=2$ -and \eqref{eq:yurinskii_app_convex_anticonc} yield for convex sets -% -\begin{align*} - \sup_{A\in \cC} - \big|\P(S\in A) -\P(T\in A)\big| - &\lesssim - \inf_{\eta > 0} - \left\{ - \left(\frac{\beta_{p,2} d}{\eta^3}\right)^{1/3} - + \left(\frac{\E[\|\Omega \|_2] d}{\eta^2}\right)^{1/3} - + \eta \sqrt{\|\Sigma^{-1}\|_\rF} - \right\}. -\end{align*} - -Alternatively, one can take $\cA = \cR$, -the class of axis-aligned rectangles in $\R^d$. -By Nazarov's Gaussian perimetric inequality -\citep{nazarov2003maximal,chernozhukov2017central}, -% -\begin{align}% - \label{eq:yurinskii_app_rect_anticonc} - \Delta_\infty(\cR, \eta) - \leq \frac{\eta (\sqrt{2\log d} + 2)}{\sigma_{\min}} -\end{align} -% -whenever $\min_j \, \Sigma_{j j} \geq \sigma_{\min}^2$ -for some $\sigma_{\min}>0$. -Proposition~\ref{pro:yurinskii_app_clt} with $p = \infty$ -and \eqref{eq:yurinskii_app_rect_anticonc} yields -% -\begin{align*}% - &\sup_{A\in \cR} - \big|\P(S\in A) -\P(T\in A)\big| - \lesssim - \inf_{\eta > 0} - \left\{ - \left(\frac{\beta_{\infty,2} \log 2d}{\eta^3}\right)^{1/3} - + \left(\frac{\E[\|\Omega \|_2] \log 2d}{\eta^2}\right)^{1/3} - + \frac{\eta \sqrt{\log 2d}}{\sigma_{\min}} - \right\}. -\end{align*} -% -In situations where -$\liminf_n \min_j \, \Sigma_{j j} = 0$, -it may be possible in certain cases to regularize -the minimum variance away from zero and then apply -a Gaussian--Gaussian rectangular approximation result -such as Lemma~2.1 from \citet{chernozhukov2023nearly}. - -\begin{remark}[Comparisons with the literature] - - The literature on high-dimensional central limit theorems - has developed rapidly in recent years - \citep[see][and references therein]{% - zhai2018high,% - koike2021notes,% - buzun2022strong,% - lopes2022central,% - chernozhukov2023nearly% - }, - particularly for the special case of - sums of independent random vectors - on the rectangular sets $\cR$. 
- % - Our corresponding results are rather weaker in terms of - dependence on the dimension than for example - \citet[Theorem~2.1]{chernozhukov2023nearly}. - This is an inherent issue due to our approach of first - considering the class of all Borel sets - and only afterwards specializing to the smaller class $\cR$, - where sharper results in the literature directly target the - Kolmogorov--Smirnov distance via Stein's method and Slepian interpolation. -\end{remark} - -Next, we present a version of Proposition~\ref{pro:yurinskii_app_clt} in which -the covariance -matrix $\Sigma$ is replaced by an estimator $\hat \Sigma$. This ensures that -the associated conditionally Gaussian vector is feasible and can be resampled, -allowing Monte Carlo quantile estimation via a Gaussian -multiplier bootstrap. - -\begin{proposition}[Bootstrap central limit theorem for martingales]% - \label{pro:yurinskii_app_bootstrap} - - Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, - with $\Sigma$ non-random, - and let $\hat \Sigma$ be an $\bX$-measurable random - $d \times d$ positive semi-definite matrix, - where $\bX = (X_1, \ldots, X_n)$. - For a class $\cA$ of measurable subsets of $\R^d$, - % - \begin{align*} - &\sup_{A\in \cA} - \left| - \P\big(S \in A\big) - - \P\big(\hat \Sigma^{1/2} Z \in A \bigm| \bX \big) - \right| \\ - &\quad\leq - \inf_{p \in [1,\infty]} \inf_{\eta>0} - \left\{ \Gamma_p(\eta) + 2 \Delta_p(\cA, \eta) - + 2d \exp\left(\frac{-\eta^2} - {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} - \right) - \right\}, - \end{align*} - % - where $Z \sim \cN(0,I_d)$ is independent of $\bX$. -\end{proposition} - -\begin{proof}[Proposition~\ref{pro:yurinskii_app_bootstrap}] - - Since $T = \Sigma^{1/2} Z$ is independent of $\bX$, - % - \begin{align*} - &\left| - \P\big(S \in A\big) - - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) - \right| \\ - &\quad\leq - \left| - \P\big(S \in A\big) - - \P\big(T \in A\big) - \right| - +\left| - \P\big(\Sigma^{1/2} Z \in A\big) - - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) - \right|. - \end{align*} - % - The first term is bounded by Proposition~\ref{pro:yurinskii_app_clt}; - the second by Lemma~\ref{lem:yurinskii_app_feasible_gaussian} - conditional on $\bX$. - % - \begin{align*} - &\left| - \P\big(S \in A\big) - - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) - \right| \\ - &\quad\leq - \Gamma_p(\eta) + \Delta_p(\cA, \eta) - + \Delta_{p'}(\cA, \eta') - + 2 d \exp \left( \frac{-\eta'^2} - {2 d^{2/p'} \big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} - \right) - \end{align*} - % - for all $A \in \cA$ - and any $p, p' \in [1, \infty]$ and $\eta, \eta' > 0$. - Taking a supremum over $A$ and infima over - $p = p'$ and $\eta = \eta'$ yields the result. - We do not need - $p = p'$ and $\eta = \eta'$ in general. - % -\end{proof} - -A natural choice for $\hat\Sigma$ in certain situations is the sample -covariance matrix $\sum_{i=1}^n X_i X_i^\T$, or a correlation-corrected variant -thereof. In general, whenever $\hat \Sigma$ does not depend on unknown -quantities, one can sample from the law of $\hat T = \hat\Sigma^{1/2} Z$ -conditional on $\bX$ to approximate the distribution of $S$. -Proposition~\ref{pro:yurinskii_app_bootstrap} verifies that this Gaussian -multiplier -bootstrap approach is valid whenever $\hat\Sigma$ and $\Sigma$ are sufficiently -close. 
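As an illustration only, the following minimal sketch (in Python, with a
hypothetical function name and interface) implements this resampling scheme
for the canonical estimator $\hat \Sigma = \sum_{i=1}^n X_i X_i^\T$, in which
case $\hat T = \hat \Sigma^{1/2} Z$ has the same conditional law as
$\sum_{i=1}^n X_i Z_i$ with $Z_1, \ldots, Z_n$ i.i.d.\ standard Gaussian
independent of $\bX$.
%
\begin{verbatim}
import numpy as np

def multiplier_bootstrap_quantile(X, p, tau, num_draws=1000, seed=0):
    # X has shape (n, d): row i is the increment X_i.
    # Each draw is T_hat = sum_i X_i Z_i with Z_i i.i.d. N(0, 1), whose
    # conditional covariance given X is Sigma_hat = sum_i X_i X_i^T.
    rng = np.random.default_rng(seed)
    n, d = X.shape
    Z = rng.standard_normal((num_draws, n))
    T_hat = Z @ X                                  # shape (num_draws, d)
    norms = np.linalg.norm(T_hat, ord=p, axis=1)   # ||T_hat||_p per draw
    return np.quantile(norms, tau)                 # Monte Carlo quantile
\end{verbatim}
%
Such resampled quantiles can then be compared with the corresponding norm of
the observed $S = \sum_{i=1}^n X_i$; their reliability rests, as just noted,
on $\hat \Sigma$ being sufficiently close to $\Sigma$.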
To this end, Theorem~X.1.1 in \citet{bhatia1997matrix} gives -$\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 -\leq \big\|\hat\Sigma - \Sigma\big\|_2^{1/2}$ -and Problem~X.5.5 in the same gives -$\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 -\leq \big\|\Sigma^{-1/2}\big\|_2 \big\|\hat\Sigma - \Sigma\big\|_2$ -when $\Sigma$ is invertible. The latter often gives a tighter bound when the -minimum eigenvalue of $\Sigma$ can be bounded away from zero, and consistency -of $\hat \Sigma$ can be established using a range of matrix concentration -inequalities. - -In Section~\ref{sec:yurinskii_app_lp} we apply -Proposition~\ref{pro:yurinskii_app_clt} to the special case -of approximating the distribution of the $\ell^p$-norm of a high-dimensional -martingale. Proposition~\ref{pro:yurinskii_app_bootstrap} is then used to -ensure that -feasible distributional approximations are also available. - -\subsection{Application: distributional approximation of martingale -\texorpdfstring{$\ell^p$}{lp}-norms} -\label{sec:yurinskii_app_lp} - -In empirical applications, -including nonparametric significance tests -\citep{lopes2020bootstrapping} -and nearest neighbor search procedures -\citep{biau2015high}, -an estimator or test statistic -can be expressed under the null hypothesis -as the $\ell^p$-norm of a zero-mean -martingale for some $p \in [1, \infty]$. -In the notation of Corollary~\ref{cor:yurinskii_sa_martingale}, -it is of interest to bound Kolmogorov--Smirnov -quantities of the form -$\sup_{t \geq 0} \big| \P( \|S\|_p \leq t) - \P( \|T\|_p \leq t) \big|$. -Let $\cB_p$ be the class of closed $\ell^p$-balls in $\R^d$ centered at the -origin and set -$\Delta_p(\eta) \vcentcolon= \Delta_p(\cB_p, \eta) -= \sup_{t \geq 0} \P( t < \|T\|_p \leq t + \eta )$. - -\begin{proposition}[Distributional approximation of - martingale $\ell^p$-norms] - \label{pro:yurinskii_app_application_lp} - - Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, - with $\Sigma$ non-random. Then for $T \sim \cN(0, \Sigma)$, - % - \begin{equation}% - \label{eq:yurinskii_app_application_lp} - \sup_{t \geq 0} - \big| \P( \|S\|_p \leq t ) - - \P\left( \|T\|_p \leq t \right) \big| - \leq \inf_{\eta>0} - \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. - \end{equation} - % -\end{proposition} - -\begin{proof}[Proposition~\ref{pro:yurinskii_app_application_lp}] - - Applying Proposition~\ref{pro:yurinskii_app_clt} - with $\cA=\cB_p$ gives - % - \begin{align*} - \sup_{t \geq 0} - \big| \P( \|S\|_p \leq t ) - - \P\left( \|T\|_p \leq t \right) \big| - &= \sup_{A\in \cB_p} - \big|\P(S\in A) -\P(T\in A)\big| \\ - &\leq - \inf_{\eta>0} - \big\{\Gamma_p(\eta) + \Delta_p(\cB_p, \eta) \big\} - \leq - \inf_{\eta>0} - \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. - \end{align*} - % -\end{proof} - -The right-hand side of -\eqref{eq:yurinskii_app_application_lp} can be controlled in various ways. -% -In the case of $p=\infty$, -note that $\ell^\infty$-balls are rectangles so -$\cB_\infty\subseteq \cR$ -and \eqref{eq:yurinskii_app_rect_anticonc} applies, giving -$\Delta_\infty(\eta) \leq \eta (\sqrt{2\log d} + 2) / \sigma_{\min}$ -whenever $\min_j \Sigma_{j j} \geq \sigma_{\min}^2$. -Alternatively, \citet[Theorem~1]{giessing2023anti} provides -$\Delta_\infty(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_\infty] + \eta^2}$. -By H{\"o}lder duality of $\ell^p$-norms, we can write -$\|T\|_p = \sup_{\|u\|_q \leq 1} u^\T T$ where $1/p + 1/q = 1$. 
-Applying the Gaussian process anti-concentration result of -\citet[Theorem~2]{giessing2023anti} yields the more general -$\Delta_p(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_p] + \eta^2}$. -Thus, the problem can be reduced to that of bounding -$\Var\left[\|T\|_p\right]$, with techniques for doing so -discussed in \citet[Section~4]{giessing2023anti}. -Alongside the $\ell^p$-norms, other functionals can be analyzed in this manner, -including the maximum and other order statistics -\citep{kozbur2021dimension,giessing2023anti}. - -To conduct inference in this setting, we must feasibly -approximate the quantiles of $\|T\|_p$. -To that end, take a significance level $\tau\in(0,1)$ and set -% -$\hat q_p(\tau) = -\inf \big\{t \in \R: \P(\|\hat T\|_p \leq t \mid \bX) \geq \tau \}$ -where $\hat T \mid \bX \sim \cN(0, \hat\Sigma)$, -% -with $\hat\Sigma$ any $\bX$-measurable positive semi-definite -estimator of $\Sigma$. -Note that for the canonical estimator $\hat\Sigma = \sum_{i=1}^n X_i X_i^\T$ -we can write $\hat T =\sum_{i=1}^n X_i Z_i$ with -$Z_1,\dots,Z_n$ i.i.d.\ standard Gaussian independent of $\bX$, -yielding the Gaussian multiplier bootstrap. -Now assuming -the law of $\|\hat T\|_p \mid \bX$ has no atoms, -we can apply Proposition~\ref{pro:yurinskii_app_bootstrap} -to see -% -\begin{align*} - &\sup_{\tau\in(0,1)} - \big|\P\left(\|S\|_p \leq \hat q_p(\tau)\right) - \tau \big| - \leq - \E\left[ - \sup_{t \geq 0} - \big| - \P(\|S\|_p \leq t) - - \P(\|\hat T\|_p \leq t \mid \bX) - \big| - \right] \\ - &\qquad\leq - \inf_{\eta>0} - \left\{ \Gamma_p(\eta) - + 2 \Delta_p(\eta) - + 2d\, \E\left[ - \exp\left(\frac{-\eta^2} - {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2}\right) - \right] - \right\}, -\end{align*} -% -and hence the bootstrap is valid whenever -$\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2$ is sufficiently small. See the -preceding discussion regarding methods for bounding this object. - -\begin{remark}[One-dimensional distributional approximations] - In our application to distributional approximation of $\ell^p$-norms, - the object of interest $\|S\|_p$ is a - one-dimensional functional of the high-dimensional martingale; - contrast this with the more general Proposition~\ref{pro:yurinskii_app_clt} - which - directly considers the $d$-dimensional random vector $S$. - As such, our coupling-based approach may be improved in certain settings - by applying a more carefully tailored smoothing argument. - For example, \citet{belloni2018high} - employ a ``log sum exponential'' bound - \citep[see also][]{chernozhukov2013gaussian} - for the maximum statistic - $\max_{1 \leq j \leq d} S_j$ - along with a coupling due to \citet{chernozhukov2014gaussian} to attain - an improved dependence on the dimension. - Naturally, their approach does not permit the formulation of - high-dimensional central limit theorems over arbitrary classes of - Borel sets as in our Proposition~\ref{pro:yurinskii_app_clt}. 
-\end{remark} - -\clearpage -\addcontentsline{toc}{chapter}{Bibliography} -\bibliographystyle{phd_dissertation} -\bibliography{refs} - -\end{document} diff --git a/tests/example2_in.tex b/tests/masters_dissertation_in.tex similarity index 53% rename from tests/example2_in.tex rename to tests/masters_dissertation_in.tex index addf42e..d123252 100644 --- a/tests/example2_in.tex +++ b/tests/masters_dissertation_in.tex @@ -34,9 +34,9 @@ % algorithms \usepackage[boxruled, - linesnumbered, - commentsnumbered, - algochapter, +linesnumbered, +commentsnumbered, +algochapter, ]{algorithm2e} % graphics @@ -47,18 +47,18 @@ % draft options \usepackage{ifdraft} \ifoptiondraft{ - \usepackage{draftwatermark} - \SetWatermarkText{DRAFT} - \SetWatermarkScale{6} - \SetWatermarkColor[rgb]{1,0.9,0.9} - \usepackage{showframe} - \usepackage{layout} +\usepackage{draftwatermark} +\SetWatermarkText{DRAFT} +\SetWatermarkScale{6} +\SetWatermarkColor[rgb]{1,0.9,0.9} +\usepackage{showframe} +\usepackage{layout} }{} %\usepackage[obeyDraft]{todonotes} % hyperlinks \usepackage[plainpages=false,draft=false - ,hidelinks +,hidelinks ]{hyperref} \usepackage{cite} @@ -66,7 +66,7 @@ \usepackage[nopostdot,nonumberlist]{glossaries} %TC:endignore - % suppress pdf warnings +% suppress pdf warnings \pdfsuppresswarningpagegroup=1 \title{Motif\hspace*{0.05cm}-Based Spectral Clustering\\[1ex] @@ -99,25 +99,25 @@ % input output definitions \makeatletter \renewcommand{\SetKwInOut}[2]{% - \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% - \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% - \newcommand\InOutSizeDefined{}\setlength{\inoutsize}{\wd\algocf@inoutbox}% - \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% - {\KwSty{#2}\algocf@typo:\hfill}~}% - \setlength{\inoutindent}{\wd\algocf@inoutbox}% - \else% else keep the larger dimension - \ifdim\wd\algocf@inoutbox>\inoutsize% - \setlength{\inoutsize}{\wd\algocf@inoutbox}% - \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% - {\KwSty{#2}\algocf@typo:\hfill}~}% - \setlength{\inoutindent}{\wd\algocf@inoutbox}% - \fi% - \fi% the dimension of the box is now defined. - \algocf@newcommand{#1}[1]{% - \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}% - {\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% - {\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~##1\par}% - \algocf@linesnumbered% reset the numbering of the lines +\sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% +\expandafter\ifx\csname InOutSizeDefined\endcsname\relax% +\newcommand\InOutSizeDefined{}\setlength{\inoutsize}{\wd\algocf@inoutbox}% +\sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% +{\KwSty{#2}\algocf@typo:\hfill}~}% +\setlength{\inoutindent}{\wd\algocf@inoutbox}% +\else% else keep the larger dimension +\ifdim\wd\algocf@inoutbox>\inoutsize% +\setlength{\inoutsize}{\wd\algocf@inoutbox}% +\sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% +{\KwSty{#2}\algocf@typo:\hfill}~}% +\setlength{\inoutindent}{\wd\algocf@inoutbox}% +\fi% +\fi% the dimension of the box is now defined. 
+\algocf@newcommand{#1}[1]{% +\ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}% +{\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% +{\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~##1\par}% +\algocf@linesnumbered% reset the numbering of the lines }}% \makeatother @@ -138,10 +138,10 @@ % glossaries \setlength{\glsdescwidth}{0.92\hsize} \newglossarystyle{mystyle}{% - \setglossarystyle{long}% - \renewenvironment{theglossary}% - {\begin{longtable}{@{}p{2cm}p{\glsdescwidth}}}% - {\end{longtable}}% +\setglossarystyle{long}% +\renewenvironment{theglossary}% +{\begin{longtable}{@{}p{2cm}p{\glsdescwidth}}}% +{\end{longtable}}% } \makeglossaries @@ -154,7 +154,7 @@ \makeatletter\let\expandableinput\@@input\makeatother %TC:endignore - % Glossary +% Glossary \newglossaryentry{MAM}{name=MAM, description={Motif adjacency matrix}} \newglossaryentry{DSBM}{name=DSBM, @@ -186,17 +186,17 @@ \clearpage{} \begin{abstract} - Clustering is an essential technique for network analysis, with applications - in a diverse range of fields. Although spectral clustering is a popular and - effective method, it fails to consider higher-order structure and can perform - poorly on directed networks. We aim to address these shortcomings by - exploring motif-based spectral clustering methods. We present new matrix - formulae for motif adjacency matrices, and a motif-based approach for - clustering bipartite networks. Comprehensive experimental results from both - synthetic and real data demonstrate the effectiveness of our techniques on a - variety of networks. We conclude that motif-based spectral clustering is a - valuable tool for analysis of directed and bipartite weighted networks, which - is also scalable and easy to implement. +Clustering is an essential technique for network analysis, with applications +in a diverse range of fields. Although spectral clustering is a popular and +effective method, it fails to consider higher-order structure and can perform +poorly on directed networks. We aim to address these shortcomings by +exploring motif-based spectral clustering methods. We present new matrix +formulae for motif adjacency matrices, and a motif-based approach for +clustering bipartite networks. Comprehensive experimental results from both +synthetic and real data demonstrate the effectiveness of our techniques on a +variety of networks. We conclude that motif-based spectral clustering is a +valuable tool for analysis of directed and bipartite weighted networks, which +is also scalable and easy to implement. \end{abstract} \clearpage{} @@ -205,19 +205,19 @@ \pagenumbering{arabic} \begin{romanpages} - \tableofcontents +\tableofcontents - \newpage - \listoffigures +\newpage +\listoffigures - \newpage - \listoftables +\newpage +\listoftables - \begingroup - \let\cleardoublepage\relax - \let\clearpage\relax - \printglossary[title=Abbreviations, style=mystyle] - \endgroup +\begingroup +\let\cleardoublepage\relax +\let\clearpage\relax +\printglossary[title=Abbreviations, style=mystyle] +\endgroup \end{romanpages} @@ -321,94 +321,94 @@ \section{Graph definitions} \label{sec:graphs_graph_definitions} notation and definitions. \begin{definition}[Graphs] - A \emph{graph} is a triple $\ca{G} = (\ca{V,E},W)$ where $\ca{V}$ is the - \emph{vertex set}, $\ca{E} \subseteq \left\{ (i,j) : i,j \in \ca{V}, i \neq j - \right\}$ is the \emph{edge set} and $W\colon \ca{E} \to (0,\infty)$ is the - \emph{weight map}. 
+A \emph{graph} is a triple $\ca{G} = (\ca{V,E},W)$ where $\ca{V}$ is the +\emph{vertex set}, $\ca{E} \subseteq \left\{ (i,j) : i,j \in \ca{V}, i \neq j +\right\}$ is the \emph{edge set} and $W\colon \ca{E} \to (0,\infty)$ is the +\emph{weight map}. \end{definition} \begin{remark} - We consider weighted directed graphs without self-loops or multiple edges. We - can extend to undirected graphs by replacing undirected edges with - bidirectional edges. Where it is not relevant, we may sometimes omit the - weight map $W$. +We consider weighted directed graphs without self-loops or multiple edges. We +can extend to undirected graphs by replacing undirected edges with +bidirectional edges. Where it is not relevant, we may sometimes omit the +weight map $W$. \end{remark} \begin{definition}[Underlying edges] - Let $\ca{G} = (\ca{V,E})$ be a graph. Its \emph{underlying edges} are - $\bar{\ca{E}} \vcentcolon = \big\{ \{i,j\} : (i,j) \in \ca{E} \big\}$. +Let $\ca{G} = (\ca{V,E})$ be a graph. Its \emph{underlying edges} are +$\bar{\ca{E}} \vcentcolon = \big\{ \{i,j\} : (i,j) \in \ca{E} \big\}$. \end{definition} \begin{definition}[Subgraphs] - A graph $\ca{G'} = (\ca{V',E'})$ is a \emph{subgraph} of a graph $\ca{G} = - (\ca{V,E})$ (write $\ca{G'} \leq \ca{G}$) if $\ca{V'} \subseteq \ca{V}$ and - $\ca{E'} \subseteq \ca{E}$. It is an \emph{induced subgraph} (write $\ca{G'} - < \ca{G}$) if further $\ca{E'} = \ca{E} \cap ( \ca{V'} \times \ca{V'} )$. +A graph $\ca{G'} = (\ca{V',E'})$ is a \emph{subgraph} of a graph $\ca{G} = +(\ca{V,E})$ (write $\ca{G'} \leq \ca{G}$) if $\ca{V'} \subseteq \ca{V}$ and +$\ca{E'} \subseteq \ca{E}$. It is an \emph{induced subgraph} (write $\ca{G'} +< \ca{G}$) if further $\ca{E'} = \ca{E} \cap ( \ca{V'} \times \ca{V'} )$. \end{definition} \begin{definition}[Connected components] - Let $\ca{G} = (\ca{V,E})$ be a graph. The \emph{connected components} of - $\ca{G}$ are the partition $\ca{C}$ generated by the transitive closure of - the relation $\sim$ on $\ca{V}$ defined by $i \sim j \iff \{i,j\} \in - \bar{\ca{E}}$. We say $\ca{G}$ is (weakly) \emph{connected} if $|\ca{C}| = - 1$. +Let $\ca{G} = (\ca{V,E})$ be a graph. The \emph{connected components} of +$\ca{G}$ are the partition $\ca{C}$ generated by the transitive closure of +the relation $\sim$ on $\ca{V}$ defined by $i \sim j \iff \{i,j\} \in +\bar{\ca{E}}$. We say $\ca{G}$ is (weakly) \emph{connected} if $|\ca{C}| = +1$. \end{definition} \begin{definition}[Graph isomorphisms] - A graph $\ca{G'} = (\ca{V',E'})$ is \emph{isomorphic} to a graph $\ca{G} = - (\ca{V,E})$ (write $\ca{G'} \cong \ca{G}$) if there is a bijection - $\phi\colon \ca{V'} \rightarrow \ca{V}$ with $(u,v) \in \ca{E'} \iff - \big(\phi(u), \phi(v) \big) \in \ca{E}$. - An isomorphism from a graph to itself is called an \emph{automorphism}. +A graph $\ca{G'} = (\ca{V',E'})$ is \emph{isomorphic} to a graph $\ca{G} = +(\ca{V,E})$ (write $\ca{G'} \cong \ca{G}$) if there is a bijection +$\phi\colon \ca{V'} \rightarrow \ca{V}$ with $(u,v) \in \ca{E'} \iff +\big(\phi(u), \phi(v) \big) \in \ca{E}$. +An isomorphism from a graph to itself is called an \emph{automorphism}. \end{definition} \begin{definition}[Motifs and anchor sets] - A \emph{motif} is a pair $(\ca{M,A})$ where $\ca{M} = (\ca{V_M,E_M})$ is a - connected graph with $\ca{V_M} = \{ 1, \ldots, m \}$ for some small $m \geq - 2$, and $\ca{A} \subseteq \ca{V_M}$ with $|\ca{A}| \geq 2$ is an \emph{anchor - set}. 
If $\ca{A} \neq \ca{V_M}$ we say the motif is \emph{anchored}, and if - $\ca{A=V_M}$ we say it is \emph{simple}. +A \emph{motif} is a pair $(\ca{M,A})$ where $\ca{M} = (\ca{V_M,E_M})$ is a +connected graph with $\ca{V_M} = \{ 1, \ldots, m \}$ for some small $m \geq +2$, and $\ca{A} \subseteq \ca{V_M}$ with $|\ca{A}| \geq 2$ is an \emph{anchor +set}. If $\ca{A} \neq \ca{V_M}$ we say the motif is \emph{anchored}, and if +$\ca{A=V_M}$ we say it is \emph{simple}. \end{definition} \begin{remark} - Anchor sets~\cite{benson2016higher} specify which r\^oles vertices play in - the motif, and are crucial for defining the collider and expander motifs - given in Section~\ref{sec:coll_expa}. When an anchor set is not given, it is - assumed that the motif is simple. Figure~\ref{fig:motif_definitions_directed} - shows all simple motifs (up to isomorphism) on at most three vertices. +Anchor sets~\cite{benson2016higher} specify which r\^oles vertices play in +the motif, and are crucial for defining the collider and expander motifs +given in Section~\ref{sec:coll_expa}. When an anchor set is not given, it is +assumed that the motif is simple. Figure~\ref{fig:motif_definitions_directed} +shows all simple motifs (up to isomorphism) on at most three vertices. \end{remark} \begin{definition}[Instances] - Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. We say that $\ca{H}$ is a - \emph{functional instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} - \leq \ca{G}$. We say that $\ca{H}$ is a \emph{structural instance} of - $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} < \ca{G}$. +Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. We say that $\ca{H}$ is a +\emph{functional instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} +\leq \ca{G}$. We say that $\ca{H}$ is a \emph{structural instance} of +$\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} < \ca{G}$. \end{definition} \begin{definition}[Anchored pairs] - Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. Suppose $\ca{H}$ is an - instance of $\ca{M}$ in $\ca{G}$. Define the \emph{anchored pairs of the - instance} $\ca{H}$ as - $$ \ca{A(H)} \vcentcolon = \big\{ \{\phi(i),\phi(j)\} : i,j \in \ca{A}, \ i - \neq j, \ \phi \textrm{ is an isomorphism from } \ca{M} \textrm{ to } \ca{H} - \big\}\,.$$ +Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. Suppose $\ca{H}$ is an +instance of $\ca{M}$ in $\ca{G}$. Define the \emph{anchored pairs of the +instance} $\ca{H}$ as +$$ \ca{A(H)} \vcentcolon = \big\{ \{\phi(i),\phi(j)\} : i,j \in \ca{A}, \ i +\neq j, \ \phi \textrm{ is an isomorphism from } \ca{M} \textrm{ to } \ca{H} +\big\}\,.$$ \end{definition} \begin{remark} - Example~\ref{ex:instances} demonstrates functional and structural instances. - Note that $\{i,j\} \in \ca{A(H)}$ if and only if $\ca{H}$ appears in $\ca{G}$ - as an instance of $\ca{M}$ with $i \neq j$ co-appearing in the image of - $\ca{A}$ under isomorphism. The motivation for this is that clustering - methods should avoid separating vertices which appear as an anchored pair. +Example~\ref{ex:instances} demonstrates functional and structural instances. +Note that $\{i,j\} \in \ca{A(H)}$ if and only if $\ca{H}$ appears in $\ca{G}$ +as an instance of $\ca{M}$ with $i \neq j$ co-appearing in the image of +$\ca{A}$ under isomorphism. The motivation for this is that clustering +methods should avoid separating vertices which appear as an anchored pair. 
\end{remark} % \begin{figure}[H] - \centering - \includegraphics[scale=0.7,draft=false]{% - ../tikz/motif_definitions_directed/motif_definitions_directed.pdf} - \caption{All simple motifs on at most three vertices} - \label{fig:motif_definitions_directed} +\centering +\includegraphics[scale=0.7,draft=false]{% +../tikz/motif_definitions_directed/motif_definitions_directed.pdf} +\caption{All simple motifs on at most three vertices} +\label{fig:motif_definitions_directed} \end{figure} \section{Adjacency and indicator matrices} @@ -421,34 +421,34 @@ \section{Adjacency and indicator matrices} Table~\ref{tab:motif_adj_mat_table}. \begin{definition}[Adjacency matrices] - Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, - n \}$. The \emph{adjacency matrix, single-edge adjacency matrix} and - \emph{double-edge adjacency matrix} of $\ca{G}$ are respectively the $n - \times n$ matrices - \begin{align*} - G_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ - (G_\mathrm{s})_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} - \textrm{ and } (j,i) \notin \ca{E} \}\,, \\ - (G_\mathrm{d})_{i j} &\vcentcolon= \big( W((i,j)) + W((j,i)) \big) \ \bb{I} - \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,. - \end{align*} +Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, +n \}$. The \emph{adjacency matrix, single-edge adjacency matrix} and +\emph{double-edge adjacency matrix} of $\ca{G}$ are respectively the $n +\times n$ matrices +\begin{align*} +G_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ +(G_\mathrm{s})_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} +\textrm{ and } (j,i) \notin \ca{E} \}\,, \\ +(G_\mathrm{d})_{i j} &\vcentcolon= \big( W((i,j)) + W((j,i)) \big) \ \bb{I} +\{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,. +\end{align*} \end{definition} \begin{definition}[Indicator matrices] - Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, - n \}$. The \emph{indicator matrix, single-edge indicator matrix, double-edge - indicator matrix, missing-edge indicator matrix} and \emph{vertex-distinct - indicator matrix} of $\ca{G}$ are respectively the $n \times n$ matrices - \begin{align*} - J_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ - (J_\mathrm{s})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } - (j,i) \notin \ca{E} \}\,, \\ - (J_\mathrm{d})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } - (j,i) \in \ca{E} \}\,, \\ - (J_0)_{i j} &\vcentcolon= \bb{I} \{ (i,j) \notin \ca{E} \textrm{ and } (j,i) - \notin \ca{E} \textrm{ and } i \neq j \}\,, \\ - (J_\mathrm{n})_{i j} &\vcentcolon= \bb{I} \{ i \neq j \}\,. - \end{align*} +Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, +n \}$. The \emph{indicator matrix, single-edge indicator matrix, double-edge +indicator matrix, missing-edge indicator matrix} and \emph{vertex-distinct +indicator matrix} of $\ca{G}$ are respectively the $n \times n$ matrices +\begin{align*} +J_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ +(J_\mathrm{s})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } +(j,i) \notin \ca{E} \}\,, \\ +(J_\mathrm{d})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } +(j,i) \in \ca{E} \}\,, \\ +(J_0)_{i j} &\vcentcolon= \bb{I} \{ (i,j) \notin \ca{E} \textrm{ and } (j,i) +\notin \ca{E} \textrm{ and } i \neq j \}\,, \\ +(J_\mathrm{n})_{i j} &\vcentcolon= \bb{I} \{ i \neq j \}\,. 
+\end{align*} \end{definition} \section{Motif adjacency matrices} \label{sec:graphs_motif_adj_matrices} @@ -466,32 +466,32 @@ \section{Motif adjacency matrices} \label{sec:graphs_motif_adj_matrices} \subsection{Definitions} \begin{definition}[Motif adjacency matrices] \label{def:motif_adj_matrices} - % - Let $\ca{G} = (\ca{V,E},W)$ be a graph with $n$ vertices and let $\ca{(M,A)}$ - be a motif. The \emph{functional} and \emph{structural motif adjacency - matrices} (MAMs) of $\ca{(M,A)}$ in $\ca{G}$ are respectively the $n \times - n$ matrices - % - \begin{align*} - M^\mathrm{func}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong - \ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e - \in \ca{E_H}} W(e)\,, \\ - M^\mathrm{struc}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong - \ca{H} < \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e - \in \ca{E_H}} W(e)\,. - \end{align*} +% +Let $\ca{G} = (\ca{V,E},W)$ be a graph with $n$ vertices and let $\ca{(M,A)}$ +be a motif. The \emph{functional} and \emph{structural motif adjacency +matrices} (MAMs) of $\ca{(M,A)}$ in $\ca{G}$ are respectively the $n \times +n$ matrices +% +\begin{align*} +M^\mathrm{func}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong +\ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e +\in \ca{E_H}} W(e)\,, \\ +M^\mathrm{struc}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong +\ca{H} < \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e +\in \ca{E_H}} W(e)\,. +\end{align*} \end{definition} \begin{remark} - Example~\ref{ex:motif_adj_matrices} gives a simple illustration of - calculating an MAM. - When $W \equiv 1$ and $\ca{M}$ is simple, the (functional or structural) MAM - entry $M_{i j} \ (i \neq j)$ simply counts the (functional or structural) - instances of $\ca{M}$ in $\ca{G}$ containing $i$ and $j$. - When $\ca{M}$ is not simple, $M_{i j}$ counts only those instances with anchor - sets containing both $i$ and $j$. - MAMs are always symmetric, since the only dependency on $(i,j)$ is via the - unordered set $\{i,j\}$. +Example~\ref{ex:motif_adj_matrices} gives a simple illustration of +calculating an MAM. +When $W \equiv 1$ and $\ca{M}$ is simple, the (functional or structural) MAM +entry $M_{i j} \ (i \neq j)$ simply counts the (functional or structural) +instances of $\ca{M}$ in $\ca{G}$ containing $i$ and $j$. +When $\ca{M}$ is not simple, $M_{i j}$ counts only those instances with anchor +sets containing both $i$ and $j$. +MAMs are always symmetric, since the only dependency on $(i,j)$ is via the +unordered set $\{i,j\}$. \end{remark} \subsection{Computation} \label{sec:graphs_computation} @@ -500,96 +500,96 @@ \subsection{Computation} \label{sec:graphs_computation} and~\ref{prop:motif_adj_matrix_computation}, we need one more definition. \begin{definition}[Anchored automorphism classes] - Let $(\ca{M,A})$ be a motif. - Let $S_\ca{M}$ be the set of permutations on $ \ca{V_M} = \{ 1, \ldots, m \}$ - and define the \emph{anchor-preserving permutations} $S_\ca{M,A} = \{ \sigma - \in S_\ca{M} : \{1,m\} \subseteq \sigma(\ca{A}) \}$. - Let $\sim$ be the equivalence relation defined on $S_\ca{M,A}$ by: $\sigma - \sim \tau \iff \tau^{-1} \sigma$ is an automorphism of $\ca{M}$. - Finally the \emph{anchored automorphism classes} are the quotient set - $S_\ca{M,A}^\sim \vcentcolon= S_\ca{M,A} \ \big/ \sim$\,. +Let $(\ca{M,A})$ be a motif. 
+Let $S_\ca{M}$ be the set of permutations on $ \ca{V_M} = \{ 1, \ldots, m \}$ +and define the \emph{anchor-preserving permutations} $S_\ca{M,A} = \{ \sigma +\in S_\ca{M} : \{1,m\} \subseteq \sigma(\ca{A}) \}$. +Let $\sim$ be the equivalence relation defined on $S_\ca{M,A}$ by: $\sigma +\sim \tau \iff \tau^{-1} \sigma$ is an automorphism of $\ca{M}$. +Finally the \emph{anchored automorphism classes} are the quotient set +$S_\ca{M,A}^\sim \vcentcolon= S_\ca{M,A} \ \big/ \sim$\,. \end{definition} \begin{proposition}[MAM formula] \label{prop:motif_adj_matrix_formula} - Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set - ${\ca{V}=\{1,\ldots,n\}}$ and let $(\ca{M,A})$ be a motif on $m$ vertices. - Then for any $i,j \in \ca{V}$ and with $k_1 = i$, $k_m = j$, the functional - and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are given by - % - % - \begin{align*} - M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in - S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ - J^\mathrm{func}_{\mathbf{k},\sigma} \ - G^\mathrm{func}_{\mathbf{k},\sigma}\,, &(1) \\ - M^\mathrm{struc}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in - S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ - J^\mathrm{struc}_{\mathbf{k},\sigma} \ - G^\mathrm{struc}_{\mathbf{k},\sigma}\,, &(2) - \end{align*} - % - where - % - \begin{align*} - \ca{E}_\ca{M}^0 &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \notin - \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ - \ca{E}_\ca{M}^\mathrm{s} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : - (u,v) \in \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ - \ca{E}_\ca{M}^\mathrm{d} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : - (u,v) \in \ca{E_M}, (v,u) \in \ca{E_M} \}\,, - \end{align*} - % - are respectively the missing edges, single edges and double edges of - $\ca{E_M}$, and - % - %TC:ignore - \begin{alignat*}{3} - % - J^\mathrm{func}_{\mathbf{k},\sigma} - & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma - u},k_{\sigma v}} - && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}}\,, \\ - % - G^\mathrm{func}_{\mathbf{k},\sigma} - & \vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma - v}} - && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma - u},k_{\sigma v}}\,, \\ - % - J^\mathrm{struc}_{\mathbf{k},\sigma} - & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_0)_{k_{\sigma u},k_{\sigma v}} - && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma - u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}}\,, \\ - % - G^\mathrm{struc}_{\mathbf{k},\sigma} - &\vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma - u},k_{\sigma v}} - && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma - u},k_{\sigma v}}\,. - % - \end{alignat*} - %TC:endignore +Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set +${\ca{V}=\{1,\ldots,n\}}$ and let $(\ca{M,A})$ be a motif on $m$ vertices. 
+Then for any $i,j \in \ca{V}$ and with $k_1 = i$, $k_m = j$, the functional +and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are given by +% +% +\begin{align*} +M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in +S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ +J^\mathrm{func}_{\mathbf{k},\sigma} \ +G^\mathrm{func}_{\mathbf{k},\sigma}\,, &(1) \\ +M^\mathrm{struc}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in +S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ +J^\mathrm{struc}_{\mathbf{k},\sigma} \ +G^\mathrm{struc}_{\mathbf{k},\sigma}\,, &(2) +\end{align*} +% +where +% +\begin{align*} +\ca{E}_\ca{M}^0 &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \notin +\ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ +\ca{E}_\ca{M}^\mathrm{s} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : +(u,v) \in \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ +\ca{E}_\ca{M}^\mathrm{d} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : +(u,v) \in \ca{E_M}, (v,u) \in \ca{E_M} \}\,, +\end{align*} +% +are respectively the missing edges, single edges and double edges of +$\ca{E_M}$, and +% +%TC:ignore +\begin{alignat*}{3} +% +J^\mathrm{func}_{\mathbf{k},\sigma} +& \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma +u},k_{\sigma v}} +&& && \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}}\,, \\ +% +G^\mathrm{func}_{\mathbf{k},\sigma} +& \vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma +v}} +&& + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma +u},k_{\sigma v}}\,, \\ +% +J^\mathrm{struc}_{\mathbf{k},\sigma} +& \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_0)_{k_{\sigma u},k_{\sigma v}} +&& && \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma +u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}}\,, \\ +% +G^\mathrm{struc}_{\mathbf{k},\sigma} +&\vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma +u},k_{\sigma v}} +&& + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma +u},k_{\sigma v}}\,. +% +\end{alignat*} +%TC:endignore \end{proposition} % \begin{proof} - See Proof~\ref{proof:motif_adj_matrix_formula}. +See Proof~\ref{proof:motif_adj_matrix_formula}. \end{proof} \begin{proposition}[Complexity of MAM formula] - \label{prop:motif_adj_matrix_computation} - Suppose that ${m \leq 3}$, and the adjacency matrix $G$ of $\ca{G}$ is known. - Then computing adjacency and indicator matrices and calculating an MAM using - Equations $(1)$ and $(2)$ in Proposition~\ref{prop:motif_adj_matrix_formula} - involves at most 18 matrix multiplications, 22 entry-wise multiplications and - 21 additions of (typically sparse) $n \times n$ matrices. +\label{prop:motif_adj_matrix_computation} +Suppose that ${m \leq 3}$, and the adjacency matrix $G$ of $\ca{G}$ is known. +Then computing adjacency and indicator matrices and calculating an MAM using +Equations $(1)$ and $(2)$ in Proposition~\ref{prop:motif_adj_matrix_formula} +involves at most 18 matrix multiplications, 22 entry-wise multiplications and +21 additions of (typically sparse) $n \times n$ matrices. \end{proposition} \begin{proof} - See Proof~\ref{proof:motif_adj_matrix_computation}. +See Proof~\ref{proof:motif_adj_matrix_computation}. 
\end{proof} Hence for motifs on at most three vertices and with sparse adjacency matrices, @@ -664,32 +664,32 @@ \section{Graph Laplacians} \label{sec:spectral_laplacians} computation \cite{von2007tutorial, luxburg2004convergence}. \begin{definition} - Let $\ca{G}$ be an undirected graph with (symmetric) adjacency matrix $G$. The - \emph{random-walk Laplacian matrix} of $\ca{G}$ is - $$ L_\mathrm{rw} \vcentcolon= I - D^{-1} G $$ - where $I$ is the identity and $D_{ii} \vcentcolon= \sum_j G_{i j}$ is the - diagonal matrix of weighted degrees. +Let $\ca{G}$ be an undirected graph with (symmetric) adjacency matrix $G$. The +\emph{random-walk Laplacian matrix} of $\ca{G}$ is +$$ L_\mathrm{rw} \vcentcolon= I - D^{-1} G $$ +where $I$ is the identity and $D_{ii} \vcentcolon= \sum_j G_{i j}$ is the +diagonal matrix of weighted degrees. \end{definition} \begin{remark} - $D^{-1} G$ is the transition matrix of a random walk on the vertex set - $\ca{V}$ - where the probability of the transition $v_i \to v_j$ is proportional to - $G_{i j}$. +$D^{-1} G$ is the transition matrix of a random walk on the vertex set +$\ca{V}$ +where the probability of the transition $v_i \to v_j$ is proportional to +$G_{i j}$. \end{remark} \begin{proposition}[Properties of the random-walk Laplacian] - \label{prop:laplacian} - $L_\mathrm{rw}$ is positive semi-definite with eigenvalues $0 = \lambda_1 \leq - \cdots \leq \lambda_n$. - The multiplicity $k$ of the eigenvalue $0$ is equal to the number of connected - components $\ca{P}_1, \ldots, \ca{P}_k$ of $\ca{G}$. - The eigenspace of the eigenvalue $0$ is spanned by the indicator vectors on - these components; $ \bb{I}_{\ca{P}_1}, \ldots, \bb{I}_{\ca{P}_k} $. +\label{prop:laplacian} +$L_\mathrm{rw}$ is positive semi-definite with eigenvalues $0 = \lambda_1 \leq +\cdots \leq \lambda_n$. +The multiplicity $k$ of the eigenvalue $0$ is equal to the number of connected +components $\ca{P}_1, \ldots, \ca{P}_k$ of $\ca{G}$. +The eigenspace of the eigenvalue $0$ is spanned by the indicator vectors on +these components; $ \bb{I}_{\ca{P}_1}, \ldots, \bb{I}_{\ca{P}_k} $. \end{proposition} \begin{proof} - See \cite{von2007tutorial}. +See \cite{von2007tutorial}. \end{proof} \section{Graph cuts} \label{sec:spectral_graph_cut} @@ -700,25 +700,25 @@ \section{Graph cuts} \label{sec:spectral_graph_cut} Laplacian. \begin{definition} - Let $\ca{G}$ be a graph. Let $ \ca{P}_1, \ldots, \ca{P}_k $ be a partition of - $\ca{V}$. Then the \emph{normalised cut} \cite{shi2000normalized} of $\ca{G}$ - with respect to $ \ca{P}_1, \ldots, \ca{P}_k $ is - % - $$ \mathrm{Ncut}_\ca{G}(\ca{P}_1, \ldots, \ca{P}_k) \vcentcolon= \frac{1}{2} - \sum_{i=1}^k - \frac{ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) }{ \mathrm{vol}(\ca{P}_i) } $$ - % - where $ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) - \vcentcolon= \sum_{u \in \ca{P}_i, \, v \in \ca{V} \setminus \ca{P}_i} - G_{u v}$ - and $\mathrm{vol}(\ca{P}_i) \vcentcolon= \sum_{u \in \ca{P}_i} D_{u u}$. +Let $\ca{G}$ be a graph. Let $ \ca{P}_1, \ldots, \ca{P}_k $ be a partition of +$\ca{V}$. 
Then the \emph{normalised cut} \cite{shi2000normalized} of $\ca{G}$ +with respect to $ \ca{P}_1, \ldots, \ca{P}_k $ is +% +$$ \mathrm{Ncut}_\ca{G}(\ca{P}_1, \ldots, \ca{P}_k) \vcentcolon= \frac{1}{2} +\sum_{i=1}^k +\frac{ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) }{ \mathrm{vol}(\ca{P}_i) } $$ +% +where $ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) +\vcentcolon= \sum_{u \in \ca{P}_i, \, v \in \ca{V} \setminus \ca{P}_i} +G_{u v}$ +and $\mathrm{vol}(\ca{P}_i) \vcentcolon= \sum_{u \in \ca{P}_i} D_{u u}$. \end{definition} \begin{remark} - More desirable partitions have a lower Ncut value; the numerators penalise - partitions which cut a large number of heavily weighted edges, and the - denominators penalise partitions which have highly imbalanced cluster sizes. +More desirable partitions have a lower Ncut value; the numerators penalise +partitions which cut a large number of heavily weighted edges, and the +denominators penalise partitions which have highly imbalanced cluster sizes. \end{remark} It can be shown \cite{von2007tutorial} that minimising Ncut over partitions @@ -766,36 +766,36 @@ \subsection{Eigenvector sweep} \label{sec:spectral_sweep} \pagebreak \begin{algorithm}[H] - \caption{Eigenvector sweep} - \label{alg:eigenvector_sweep} - - \SetKwFunction{Main}{EigenvectorSweep} - \newcommand{\MainArgs}{$\ca{G}, x$} - - \BlankLine - \Input{Graph $\ca{G}$, eigenvector $x$} - \Output{Partition $\ca{P}_1, \ca{P}_2$} - \BlankLine - \Function{\Main{\MainArgs}}{ - - $\hat{x} \leftarrow \mathtt{sort}(x)$ \; - $\mathrm{Score_{best}} \leftarrow \infty$ \; - - \For{$i$ \In $1, \ldots, n-1$}{ - $\ca{P} \leftarrow \{ \hat{x}_1, \ldots \hat{x}_i \}$ \; - $\mathrm{Score} \leftarrow \mathrm{Ncut}_\ca{G} (\ca{P}, \ca{V} - \setminus \ca{P})$ \; - \If{$\mathrm{Score} < \mathrm{Score_{best}}$}{ - $\ca{P}_\mathrm{best} \leftarrow \ca{P}$ \; - $\mathrm{Score_{best}} \leftarrow \mathrm{Score}$ \; - } - } - - $\ca{P}_1 \leftarrow \ca{P}_\mathrm{best}$ \; - $\ca{P}_2 \leftarrow \ca{V} \setminus \ca{P}_\mathrm{best}$ \; - - \Return $\ca{P}_1, \ca{P}_2$ - } +\caption{Eigenvector sweep} +\label{alg:eigenvector_sweep} + +\SetKwFunction{Main}{EigenvectorSweep} +\newcommand{\MainArgs}{$\ca{G}, x$} + +\BlankLine +\Input{Graph $\ca{G}$, eigenvector $x$} +\Output{Partition $\ca{P}_1, \ca{P}_2$} +\BlankLine +\Function{\Main{\MainArgs}}{ + +$\hat{x} \leftarrow \mathtt{sort}(x)$ \; +$\mathrm{Score_{best}} \leftarrow \infty$ \; + +\For{$i$ \In $1, \ldots, n-1$}{ +$\ca{P} \leftarrow \{ \hat{x}_1, \ldots \hat{x}_i \}$ \; +$\mathrm{Score} \leftarrow \mathrm{Ncut}_\ca{G} (\ca{P}, \ca{V} +\setminus \ca{P})$ \; +\If{$\mathrm{Score} < \mathrm{Score_{best}}$}{ +$\ca{P}_\mathrm{best} \leftarrow \ca{P}$ \; +$\mathrm{Score_{best}} \leftarrow \mathrm{Score}$ \; +} +} + +$\ca{P}_1 \leftarrow \ca{P}_\mathrm{best}$ \; +$\ca{P}_2 \leftarrow \ca{V} \setminus \ca{P}_\mathrm{best}$ \; + +\Return $\ca{P}_1, \ca{P}_2$ +} \end{algorithm} \vspace*{0.5cm} @@ -809,23 +809,23 @@ \subsection{Eigenvector sweep} \label{sec:spectral_sweep} % % \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../tikz/eigenvector_sweep_network/eigenvector_sweep_network.pdf} - \caption{A small network} - \label{fig:eigenvector_sweep_network} - \end{subfigure} - % - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/eigenvector_sweep/eigenvector_sweep_scores.pdf} - \caption{Sweep profile of the network} - \label{fig:eigenvector_sweep_profile} - \end{subfigure} - 
\caption{Eigenvector sweep selects a partition by minimising Ncut} - \label{fig:eigenvector_sweep} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../tikz/eigenvector_sweep_network/eigenvector_sweep_network.pdf} +\caption{A small network} +\label{fig:eigenvector_sweep_network} +\end{subfigure} +% +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/eigenvector_sweep/eigenvector_sweep_scores.pdf} +\caption{Sweep profile of the network} +\label{fig:eigenvector_sweep_profile} +\end{subfigure} +\caption{Eigenvector sweep selects a partition by minimising Ncut} +\label{fig:eigenvector_sweep} \end{figure} % @@ -865,30 +865,30 @@ \subsection{Random-walk spectral clustering} \vspace*{0.5cm} \begin{algorithm}[H] - \caption{Random-walk spectral clustering} - \label{alg:rwspectclust} - - \SetKwFunction{Main}{RWSpectClust} - \newcommand{\MainArgs}{$G,k,l$} - - \BlankLine - \Input{Symmetric adjacency matrix $G$, number of clusters $k$, dimension - $l$} - \Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$} - \BlankLine - - \Function{\Main{\MainArgs}}{ - Construct the weighted degree matrix $D_{ii} \leftarrow \sum_j G_{i j}$ - \\ - Construct the random walk Laplacian matrix $L_\mathrm{rw} \leftarrow - I-D^{-1}G$ \\ - Let $H$ have the first $l$ eigenvectors of $L_\mathrm{rw}$ as columns - \\ - Drop the first column of $H$ \\ - Run $k$-means++ on the rows of $H$ with $k$ clusters to produce - $\ca{P}_1, \ldots, \ca{P}_k$ \\ - \Return $\ca{P}_1, \ldots, \ca{P}_k$ - } +\caption{Random-walk spectral clustering} +\label{alg:rwspectclust} + +\SetKwFunction{Main}{RWSpectClust} +\newcommand{\MainArgs}{$G,k,l$} + +\BlankLine +\Input{Symmetric adjacency matrix $G$, number of clusters $k$, dimension +$l$} +\Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$} +\BlankLine + +\Function{\Main{\MainArgs}}{ +Construct the weighted degree matrix $D_{ii} \leftarrow \sum_j G_{i j}$ +\\ +Construct the random walk Laplacian matrix $L_\mathrm{rw} \leftarrow +I-D^{-1}G$ \\ +Let $H$ have the first $l$ eigenvectors of $L_\mathrm{rw}$ as columns +\\ +Drop the first column of $H$ \\ +Run $k$-means++ on the rows of $H$ with $k$ clusters to produce +$\ca{P}_1, \ldots, \ca{P}_k$ \\ +\Return $\ca{P}_1, \ldots, \ca{P}_k$ +} \end{algorithm} @@ -934,27 +934,27 @@ \subsection{Motif-based random-walk spectral clustering} \vspace*{0.5cm} \begin{algorithm}[H] - \caption{Motif-based random-walk spectral clustering} - \label{alg:motifrwspectclust} - - \SetKwFunction{Main}{MotifRWSpectClust} - \newcommand{\MainArgs}{$\ca{G},\mathcal{M},k,l$} - - \BlankLine - \Input{Graph $\ca{G}$, motif $\ca{M}$, number of clusters $k$, dimension - $l$} - \Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$} - \BlankLine - - \Function{\Main{\MainArgs}}{ - Construct the motif adjacency matrix $M$ of the graph $\ca{G}$ with - motif $\ca{M}$ \\ - Let $\tilde{M}$ be $M$ restricted to its largest connected component, - $C$ \\ - $\ca{P}_1, \ldots, \ca{P}_k \leftarrow$ - \texttt{RWSpectClust($\tilde{M},k,l$)} \\ - \Return $\ca{P}_1, \ldots, \ca{P}_k$ - } +\caption{Motif-based random-walk spectral clustering} +\label{alg:motifrwspectclust} + +\SetKwFunction{Main}{MotifRWSpectClust} +\newcommand{\MainArgs}{$\ca{G},\mathcal{M},k,l$} + +\BlankLine +\Input{Graph $\ca{G}$, motif $\ca{M}$, number of clusters $k$, dimension +$l$} +\Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$} +\BlankLine + +\Function{\Main{\MainArgs}}{ +Construct the motif adjacency matrix $M$ of the graph $\ca{G}$ with +motif 
$\ca{M}$ \\ +Let $\tilde{M}$ be $M$ restricted to its largest connected component, +$C$ \\ +$\ca{P}_1, \ldots, \ca{P}_k \leftarrow$ +\texttt{RWSpectClust($\tilde{M},k,l$)} \\ +\Return $\ca{P}_1, \ldots, \ca{P}_k$ +} \end{algorithm} @@ -994,18 +994,18 @@ \subsection{Symmetric two-block DSBMs} $n_1=n_2=n$ and $F = \begin{psmallmatrix} - p & q \\ q & p +p & q \\ q & p \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:sym_two_block_dsbm} illustrates the block structure and sparsity matrix of this model. Thicker lines indicate existence of edges with higher probability. \begin{figure}[H] - \centering - \includegraphics[scale=0.8,draft=false]{% - ../tikz/sym_two_block_dsbm/sym_two_block_dsbm.pdf} - \caption{Symmetric two-block DSBM block structure and sparsity matrix} - \label{fig:sym_two_block_dsbm} +\centering +\includegraphics[scale=0.8,draft=false]{% +../tikz/sym_two_block_dsbm/sym_two_block_dsbm.pdf} +\caption{Symmetric two-block DSBM block structure and sparsity matrix} +\label{fig:sym_two_block_dsbm} \end{figure} We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various @@ -1023,20 +1023,20 @@ \subsection{Symmetric two-block DSBMs} only cluster a subset of the vertices of $\ca{G}$. \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/motifsym/motifsym_1.pdf} - \caption{$n=50$, $p=0.3$, $q=0.2$} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/motifsym/motifsym_2.pdf} - \caption{$n=100$, $p=0.15$, $q=0.1$} - \end{subfigure} - \caption{ARI violin plots for the symmetric two-block DSBM} - \label{fig:motifsym} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/motifsym/motifsym_1.pdf} +\caption{$n=50$, $p=0.3$, $q=0.2$} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/motifsym/motifsym_2.pdf} +\caption{$n=100$, $p=0.15$, $q=0.1$} +\end{subfigure} +\caption{ARI violin plots for the symmetric two-block DSBM} +\label{fig:motifsym} \end{figure} \subsection{Asymmetric two-block DSBMs} \label{sec:motif_asymm_dsbms} @@ -1045,17 +1045,17 @@ \subsection{Asymmetric two-block DSBMs} \label{sec:motif_asymm_dsbms} $n_1=n_2=n$ and $F = \begin{psmallmatrix} - p & q_1 \\ q_2 & p +p & q_1 \\ q_2 & p \end{psmallmatrix}$ where $q_1 > q_2$ and $p = \frac{1}{2}(q_1+q_2)$. Figure~\ref{fig:asym_two_block_dsbm} shows this model. \begin{figure}[H] - \centering - \includegraphics[scale=0.8,draft=false]{% - ../tikz/asym_two_block_dsbm/asym_two_block_dsbm.pdf} - \caption{Asymmetric two-block DSBM block structure and sparsity matrix} - \label{fig:asym_two_block_dsbm} +\centering +\includegraphics[scale=0.8,draft=false]{% +../tikz/asym_two_block_dsbm/asym_two_block_dsbm.pdf} +\caption{Asymmetric two-block DSBM block structure and sparsity matrix} +\label{fig:asym_two_block_dsbm} \end{figure} We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various @@ -1074,20 +1074,20 @@ \subsection{Asymmetric two-block DSBMs} \label{sec:motif_asymm_dsbms} traditional method performs extremely poorly. 
\begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/motifasym/motifasym_1.pdf} - \caption{$n=100$, $p=0.2$, $q_1=0.35$, $q_2=0.05$} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/motifasym/motifasym_2.pdf} - \caption{$n=200$, $p=0.15$, $q_1=0.25$, $q_2=0.05$} - \end{subfigure} - \caption{ARI violin plots for the asymmetric two-block DSBM} - \label{fig:motifasym} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/motifasym/motifasym_1.pdf} +\caption{$n=100$, $p=0.2$, $q_1=0.35$, $q_2=0.05$} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/motifasym/motifasym_2.pdf} +\caption{$n=200$, $p=0.15$, $q_1=0.25$, $q_2=0.05$} +\end{subfigure} +\caption{ARI violin plots for the asymmetric two-block DSBM} +\label{fig:motifasym} \end{figure} \section{US Political Blogs network} \label{sec:motif_polblogs} @@ -1114,21 +1114,21 @@ \section{US Political Blogs network} \label{sec:motif_polblogs} \vspace*{0.5cm} \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/polblogs/polblogs_network.pdf} - \caption{The US Political Blogs network} - \label{fig:polblogs_network} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/polblogs/polblogs_ari_conn.pdf} - \caption{ARI against $|C|$ across motifs} - \label{fig:polblogs_ariplot} - \end{subfigure} - \caption{Plots relating to the US Political Blogs network} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/polblogs/polblogs_network.pdf} +\caption{The US Political Blogs network} +\label{fig:polblogs_network} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/polblogs/polblogs_ari_conn.pdf} +\caption{ARI against $|C|$ across motifs} +\label{fig:polblogs_ariplot} +\end{subfigure} +\caption{Plots relating to the US Political Blogs network} \end{figure} Figure~\ref{fig:polblogs_embedding} shows the embedding given by eigenvectors 2 @@ -1142,22 +1142,22 @@ \section{US Political Blogs network} \label{sec:motif_polblogs} \vspace*{0.5cm} \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/polblogs/polblogs_M12_truth.pdf} - \caption{Colouring by truth label} - \label{fig:polblogs_embedding_truth} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/polblogs/polblogs_M12_clusts.pdf} - \caption{Colouring by $k$-means++ cluster} - \label{fig:polblogs_embedding_kmeans} - \end{subfigure} - \caption{Eigendecomposition embedding of the US Political Blogs network} - \label{fig:polblogs_embedding} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/polblogs/polblogs_M12_truth.pdf} +\caption{Colouring by truth label} +\label{fig:polblogs_embedding_truth} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/polblogs/polblogs_M12_clusts.pdf} +\caption{Colouring by $k$-means++ cluster} +\label{fig:polblogs_embedding_kmeans} +\end{subfigure} +\caption{Eigendecomposition embedding of the 
US Political Blogs network} +\label{fig:polblogs_embedding} \end{figure} \pagebreak @@ -1191,26 +1191,26 @@ \section{US Migration network} \label{sec:motif_migration} $\textrm{ARI}(\ca{M}_6, \ca{M}_9) = 0.73$. \begin{figure}[H] - \begin{subfigure}{.325\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/us_migration/us_migration_sweep_profile_Ms.pdf} - \caption{$\ca{M}_\mathrm{s}$} - \end{subfigure} - \begin{subfigure}{.325\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/us_migration/us_migration_sweep_profile_M6.pdf} - \caption{$\ca{M}_6$} - \end{subfigure} - \begin{subfigure}{.325\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/us_migration/us_migration_sweep_profile_M9.pdf} - \caption{$\ca{M}_9$} - \end{subfigure} - \caption{Sweep profiles of the US Migration network} - \label{fig:migration_sweep} +\begin{subfigure}{.325\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/us_migration/us_migration_sweep_profile_Ms.pdf} +\caption{$\ca{M}_\mathrm{s}$} +\end{subfigure} +\begin{subfigure}{.325\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/us_migration/us_migration_sweep_profile_M6.pdf} +\caption{$\ca{M}_6$} +\end{subfigure} +\begin{subfigure}{.325\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/us_migration/us_migration_sweep_profile_M9.pdf} +\caption{$\ca{M}_9$} +\end{subfigure} +\caption{Sweep profiles of the US Migration network} +\label{fig:migration_sweep} \end{figure} Next, Figure~\ref{fig:us_migration} plots maps of the US, with counties @@ -1254,16 +1254,16 @@ \section{US Migration network} \label{sec:motif_migration} \vspace*{-1cm} \begin{figure}[H] - \begin{table}[H] - \centering - \setlength{\tabcolsep}{0em} - \begin{tabular}{ |c|c|c|c| } - \expandableinput ../../results/us_migration/us_migration_table.txt - \end{tabular} - \end{table} - \vspace*{-0.5cm} - \caption{Motif-based colourings of the US Migration network} - \label{fig:us_migration} +\begin{table}[H] +\centering +\setlength{\tabcolsep}{0em} +\begin{tabular}{ |c|c|c|c| } +\expandableinput ../../results/us_migration/us_migration_table.txt +\end{tabular} +\end{table} +\vspace*{-0.5cm} +\caption{Motif-based colourings of the US Migration network} +\label{fig:us_migration} \end{figure} \clearpage{} \clearpage{} @@ -1284,11 +1284,11 @@ \chapter{Bipartite Clustering} \label{chap:bipartite} \section{Bipartite graphs} \label{sec:bipartite_graphs} \begin{definition} - A \emph{bipartite graph} is a graph $\ca{G}=(\ca{V,E})$ where $\ca{V}$ can be - partitioned into $\ca{V} = \ca{S} \sqcup \ca{D}$ such that $\ca{E} \subseteq - \ca{S} \times \ca{D}$. That is, every edge starts in $\ca{S}$ and ends in - $\ca{D}$. We refer to $\ca{S}$ as the \emph{source vertices} and to $\ca{D}$ - as the \emph{destination vertices}. +A \emph{bipartite graph} is a graph $\ca{G}=(\ca{V,E})$ where $\ca{V}$ can be +partitioned into $\ca{V} = \ca{S} \sqcup \ca{D}$ such that $\ca{E} \subseteq +\ca{S} \times \ca{D}$. That is, every edge starts in $\ca{S}$ and ends in +$\ca{D}$. We refer to $\ca{S}$ as the \emph{source vertices} and to $\ca{D}$ +as the \emph{destination vertices}. \end{definition} \subsection{Collider and expander motifs} \label{sec:coll_expa} @@ -1299,10 +1299,10 @@ \subsection{Collider and expander motifs} \label{sec:coll_expa} \}$. 
\begin{figure}[H] - \centering - \includegraphics[scale=0.8,draft=false]{../tikz/expa_coll/expa_coll.pdf} - \caption{The collider and expander motifs} - \label{fig:expa_coll} +\centering +\includegraphics[scale=0.8,draft=false]{../tikz/expa_coll/expa_coll.pdf} +\caption{The collider and expander motifs} +\label{fig:expa_coll} \end{figure} These motifs are useful for bipartite clustering because of @@ -1316,25 +1316,25 @@ \subsection{Collider and expander motifs} \label{sec:coll_expa} from) that neighbour. \begin{proposition}[Colliders and expanders in bipartite graphs] - \label{prop:coll_expa_formulae} - Let $\ca{G} = (\ca{V,E},W)$ be a directed bipartite graph. Let - $M_\mathrm{coll}$ and $M_\mathrm{expa}$ be the structural or functional MAMs - of $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$ respectively in - $\ca{G}$. Then - % - \begin{align*} - (M_\mathrm{coll})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} - \sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} - \frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,, &(1)\\ - (M_\mathrm{expa})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} - \sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} - \hspace*{-0.2cm}\frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. &(2) - \end{align*} - % +\label{prop:coll_expa_formulae} +Let $\ca{G} = (\ca{V,E},W)$ be a directed bipartite graph. Let +$M_\mathrm{coll}$ and $M_\mathrm{expa}$ be the structural or functional MAMs +of $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$ respectively in +$\ca{G}$. Then +% +\begin{align*} +(M_\mathrm{coll})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} +\sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} +\frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,, &(1)\\ +(M_\mathrm{expa})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} +\sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} +\hspace*{-0.2cm}\frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. &(2) +\end{align*} +% \end{proposition} % \begin{proof} - See Proof~\ref{proof:coll_expa_formulae}. +See Proof~\ref{proof:coll_expa_formulae}. 
\end{proof} \subsection{Bipartite spectral clustering algorithm} @@ -1348,36 +1348,36 @@ \subsection{Bipartite spectral clustering algorithm} \vspace*{0.5cm} \begin{algorithm}[H] - \SetKwFunction{Main}{BipartiteRWSpectClust} - \newcommand{\MainArgs}{$\ca{G},k_\ca{S},k_\ca{D},l_\ca{S},l_\ca{D}$} - - \BlankLine - \Input{Bipartite graph $\ca{G}$, source clusters $k_\ca{S}$, destination - clusters $k_\ca{D}$, source dimension $l_\ca{S}$, destination dimension - $l_\ca{D}$} - \Output{Source partition $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$, destination - partition $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$} - \BlankLine - - \Function{\Main{\MainArgs}}{ - Construct the collider motif adjacency matrix $M_\mathrm{coll}$ of the - graph $\ca{G}$ \\ - Construct the expander motif adjacency matrix $M_\mathrm{expa}$ of the - graph $\ca{G}$ \\ - $M_\mathrm{coll} \leftarrow M_\mathrm{coll}[\ca{S,S}]$ \Comm*{restrict rows - and columns of $M_\mathrm{coll}$ to $\ca{S}$ \hspace*{0.07cm}} - $M_\mathrm{expa} \leftarrow M_\mathrm{expa}[\ca{D,D}]$ \Comm*{restrict rows - and columns of $M_\mathrm{expa}$ to $\ca{D}$} - $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}} \leftarrow$ - \texttt{RWSpectClust($M_\mathrm{coll},k_\ca{S},l_\ca{S}$)} \\ - $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}} \leftarrow$ - \texttt{RWSpectClust($M_\mathrm{expa},k_\ca{D},l_\ca{D}$)} \\ - \Return $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$ and $\ca{D}_1, \ldots, - \ca{D}_{k_\ca{D}}$ - } - - \caption{Bipartite random walk spectral clustering} - \label{alg:bipartite_clustering} +\SetKwFunction{Main}{BipartiteRWSpectClust} +\newcommand{\MainArgs}{$\ca{G},k_\ca{S},k_\ca{D},l_\ca{S},l_\ca{D}$} + +\BlankLine +\Input{Bipartite graph $\ca{G}$, source clusters $k_\ca{S}$, destination +clusters $k_\ca{D}$, source dimension $l_\ca{S}$, destination dimension +$l_\ca{D}$} +\Output{Source partition $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$, destination +partition $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$} +\BlankLine + +\Function{\Main{\MainArgs}}{ +Construct the collider motif adjacency matrix $M_\mathrm{coll}$ of the +graph $\ca{G}$ \\ +Construct the expander motif adjacency matrix $M_\mathrm{expa}$ of the +graph $\ca{G}$ \\ +$M_\mathrm{coll} \leftarrow M_\mathrm{coll}[\ca{S,S}]$ \Comm*{restrict rows +and columns of $M_\mathrm{coll}$ to $\ca{S}$ \hspace*{0.07cm}} +$M_\mathrm{expa} \leftarrow M_\mathrm{expa}[\ca{D,D}]$ \Comm*{restrict rows +and columns of $M_\mathrm{expa}$ to $\ca{D}$} +$\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}} \leftarrow$ +\texttt{RWSpectClust($M_\mathrm{coll},k_\ca{S},l_\ca{S}$)} \\ +$\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}} \leftarrow$ +\texttt{RWSpectClust($M_\mathrm{expa},k_\ca{D},l_\ca{D}$)} \\ +\Return $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$ and $\ca{D}_1, \ldots, +\ca{D}_{k_\ca{D}}$ +} + +\caption{Bipartite random walk spectral clustering} +\label{alg:bipartite_clustering} \end{algorithm} \section{Bipartite stochastic block models} \label{sec:bipartite_sbms} @@ -1386,8 +1386,8 @@ \section{Bipartite stochastic block models} \label{sec:bipartite_sbms} \cite{florescu2016spectral} as the DSBM with $k=4$, $n_1 = \dots = n_4=n$ and $F = \begin{psmallmatrix} - 0 & 0 & p & q \\ 0 & 0 & q & p \\ 0 & 0 & 0 & 0 \\ 0 - & 0 & 0 & 0 +0 & 0 & p & q \\ 0 & 0 & q & p \\ 0 & 0 & 0 & 0 \\ 0 +& 0 & 0 & 0 \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:bipartite_bsbm} illustrates the block structure and sparsity matrix of this model. 
This model partitions the source vertices as @@ -1396,11 +1396,11 @@ \section{Bipartite stochastic block models} \label{sec:bipartite_sbms} $\ca{S}_1$ to $\ca{D}_1$ and from $\ca{S}_2$ to $\ca{D}_2$. \begin{figure}[H] - \centering - \includegraphics[scale=0.8,draft=false]{% - ../tikz/bipartite_dsbm/bipartite_dsbm.pdf} - \caption{BSBM block structure and sparsity matrix} - \label{fig:bipartite_bsbm} +\centering +\includegraphics[scale=0.8,draft=false]{% +../tikz/bipartite_dsbm/bipartite_dsbm.pdf} +\caption{BSBM block structure and sparsity matrix} +\label{fig:bipartite_bsbm} \end{figure} We test the performance of Algorithm~\ref{alg:bipartite_clustering} with @@ -1417,20 +1417,20 @@ \section{Bipartite stochastic block models} \label{sec:bipartite_sbms} vertices. \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/bipartite/bipartite1.pdf} - \caption{$n=100$, $p=0.2$, $q=0.1$} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/bipartite/bipartite2.pdf} - \caption{$n=200$, $p=0.1$, $q=0.06$} - \end{subfigure} - \caption{ARI violin plots for the BSBM} - \label{fig:bipartite} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/bipartite/bipartite1.pdf} +\caption{$n=100$, $p=0.2$, $q=0.1$} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/bipartite/bipartite2.pdf} +\caption{$n=200$, $p=0.1$, $q=0.06$} +\end{subfigure} +\caption{ARI violin plots for the BSBM} +\label{fig:bipartite} \end{figure} \section{American Revolution network} \label{sec:bipartite_american_revolution} @@ -1451,22 +1451,22 @@ \section{American Revolution network} \label{sec:bipartite_american_revolution} in clustering organisations based on their common members. \begin{figure}[H] - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/american_revolution/american_revolution_source.pdf} - \caption{Grouping people into 5 clusters} - \label{fig:bipartite_revolution_source} - \end{subfigure} - \begin{subfigure}{.49\textwidth} - \centering - \includegraphics[scale=0.4,draft=false]{% - ../../results/american_revolution/american_revolution_dest.pdf} - \caption{Grouping organisations into 2 clusters} - \label{fig:bipartite_revolution_dest} - \end{subfigure} - \caption{Bipartite clustering of the American Revolution network} - \label{fig:bipartite_revolution} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/american_revolution/american_revolution_source.pdf} +\caption{Grouping people into 5 clusters} +\label{fig:bipartite_revolution_source} +\end{subfigure} +\begin{subfigure}{.49\textwidth} +\centering +\includegraphics[scale=0.4,draft=false]{% +../../results/american_revolution/american_revolution_dest.pdf} +\caption{Grouping organisations into 2 clusters} +\label{fig:bipartite_revolution_dest} +\end{subfigure} +\caption{Bipartite clustering of the American Revolution network} +\label{fig:bipartite_revolution} \end{figure} \section{Unicode Languages network} \label{sec:bipartite_languages} @@ -1508,20 +1508,20 @@ \section{Unicode Languages network} \label{sec:bipartite_languages} Norwegian Bokm{\aa}l and Norwegian Nynorsk. 
\begin{figure}[H] - \centering - \includegraphics[scale=0.6, draft=false]{% - ../../results/languages/languages_source_map_clusts.pdf} - \caption{Clustering the territories from the Unicode Languages network} - \label{fig:bipartite_languages_map} +\centering +\includegraphics[scale=0.6, draft=false]{% +../../results/languages/languages_source_map_clusts.pdf} +\caption{Clustering the territories from the Unicode Languages network} +\label{fig:bipartite_languages_map} \end{figure} \begin{table}[H] - \centering - \scriptsize - \begin{tabular}{ |c|c|c|c|c|c| } - \hline - \rule{0pt}{1.2em} - \cellcolor[HTML]{8DD3C7} Cluster 1 & +\centering +\scriptsize +\begin{tabular}{ |c|c|c|c|c|c| } +\hline +\rule{0pt}{1.2em} +\cellcolor[HTML]{8DD3C7} Cluster 1 & \cellcolor[HTML]{FFFFB3} Cluster 2 & \cellcolor[HTML]{BEBADA} Cluster 3 & \cellcolor[HTML]{FB8072} Cluster 4 & @@ -1555,11 +1555,11 @@ \section{Unicode Languages network} \label{sec:bipartite_languages} $|\textrm{Cluster\ } 4 |$ = 11 & $|\textrm{Cluster\ } 5 |$ = 10 & $|\textrm{Cluster\ } 6 |$ = 3 - \\[0.1cm] - \hline - \end{tabular} - \caption{Clustering the territories from the Unicode Languages network} - \label{tab:bipartite_languages_source_clusters} +\\[0.1cm] +\hline +\end{tabular} +\caption{Clustering the territories from the Unicode Languages network} +\label{tab:bipartite_languages_source_clusters} \end{table} For the destination vertices, we present the six clusters obtained by @@ -1585,13 +1585,13 @@ \section{Unicode Languages network} \label{sec:bipartite_languages} \vspace*{0.5cm} \begin{table}[H] - \centering - \scriptsize - \begin{tabular}{ |c|c|c|c|c|c| } - \hline - \rule{0pt}{1.2em} - Cluster 1 & Cluster 2 & Cluster 3 & Cluster 4 & - Cluster 5 & Cluster 6 \\[0.1cm] +\centering +\scriptsize +\begin{tabular}{ |c|c|c|c|c|c| } +\hline +\rule{0pt}{1.2em} +Cluster 1 & Cluster 2 & Cluster 3 & Cluster 4 & +Cluster 5 & Cluster 6 \\[0.1cm] \hline \rule{0pt}{1.2em} Spanish & English & Swahili & Indonesian & Chinese & Thai \\ Arabic & Hindi & Kinyarwanda & Javanese & Wu Chinese & N.E.\ Thai \\ @@ -1620,11 +1620,11 @@ \section{Unicode Languages network} \label{sec:bipartite_languages} $|\textrm{Cluster\ } 4 |$ = 15 & $|\textrm{Cluster\ } 5 |$ = 13 & $|\textrm{Cluster\ } 6 |$ = 7 - \\[0.1cm] - \hline - \end{tabular} - \caption{Clustering the languages from the Unicode Languages network} - \label{tab:bipartite_languages_dest_clusters} +\\[0.1cm] +\hline +\end{tabular} +\caption{Clustering the languages from the Unicode Languages network} +\label{tab:bipartite_languages_dest_clusters} \end{table} \clearpage{} @@ -1704,395 +1704,395 @@ \chapter{Proofs and Examples}\label{chap:appendix_proofs} \section{Proofs} \begin{prf}[Proposition~\ref{prop:motif_adj_matrix_formula}, MAM formula] - \label{proof:motif_adj_matrix_formula} - % - Consider $(1)$. We sum over functional instances $\ca{M} \cong \ca{H} \leq - \ca{G}$ such that $\{i,j\} \in \ca{A(H)}$. - This is equivalent to summing over $\{k_2, \ldots, k_{m-1}\} \subseteq - \ca{V}$ and $\sigma \in S_\ca{M,A}^\sim$, such that $k_u$ are all distinct - and - % - $$ (u,v) \in \ca{E_M} \implies (k_{\sigma u}, k_{\sigma v}) \in \ca{E}\,. - \qquad (\dagger) $$ - % - This is because the vertex set $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ - indicates which vertices are present in the instance $\ca{H}$, and $\sigma$ - describes the mapping from $\ca{V_M}$ onto those vertices: $u \mapsto - k_{\sigma u}$. 
We take $\sigma \in S_\ca{M,A}^\sim$ to ensure that $\{i,j\} - \in \ca{A(H)}$ (since $i=k_1, \ j=k_m$), and that instances are counted - exactly once. - The condition $(\dagger)$ is to check that $\ca{H}$ is a functional instance - of $\ca{M}$ in $\ca{G}$. Hence - % - \begin{align*} - M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} - \sum_{\ca{M} \cong \ca{H} \leq - \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in - \ca{E_H}} W(e) \\ - % - &= \frac{1}{|\ca{E_M}|} \sum_{\{ k_2, \ldots, k_{m-1} \}} \sum_{\sigma \in - S_\ca{M,A}^\sim} \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) - \big\} \sum_{e \in \ca{E_H}} W(e)\,. - \end{align*} - % - For the first term, by conditioning on the types of edge in $\ca{E_M}$: - \begin{align*} - % - \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} - &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{ k_{\sigma u} \neq k_{\sigma v} \} \\ - & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{ (k_{\sigma u}, - k_{\sigma v}) \in \ca{E} \} \\ - & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, - k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in - \ca{E}\} \\ - % - &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} - \\ - % - &= J^\mathrm{func}_{\mathbf{k},\sigma}\,. - % - \end{align*} - % - Assuming $\big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\}$, the second - term is - % - \begin{align*} - % - \sum_{e \in \ca{E_H}} W(e) - &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) - + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + - W((k_{\sigma v},k_{\sigma u})) \big) \\ - % - &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} - + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}} \\ - % - &= G^\mathrm{func}_{\mathbf{k},\sigma} - \end{align*} - % - as required. For $(2)$, we simply change $(\dagger)$ to $(\ddagger)$ to check - that an instance is a \emph{structural} instance: - % - $$ (u,v) \in \ca{E_M} \iff (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \qquad - (\ddagger) $$ - % - Now for the first term: - % - \begin{align*} - % - \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\} - &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \notin - \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ - & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{(k_{\sigma u}, - k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) - \notin - \ca{E}\} \\ - & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, - k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in - \ca{E}\} \\ - % - &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{0})_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} - \\ - % - &= J^\mathrm{struc}_{\mathbf{k},\sigma}\,. 
- % - \end{align*} - % - Assuming $\big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\}$, the - second term is - % - \begin{align*} - % - \sum_{e \in \ca{E_H}} W(e) - &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) - + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + - W((k_{\sigma v},k_{\sigma u})) \big) \\ - % - &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma - v}} - + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}} \\ - % - &= G^\mathrm{struc}_{\mathbf{k},\sigma}\,. - \end{align*} - - \hfill $\square$ +\label{proof:motif_adj_matrix_formula} +% +Consider $(1)$. We sum over functional instances $\ca{M} \cong \ca{H} \leq +\ca{G}$ such that $\{i,j\} \in \ca{A(H)}$. +This is equivalent to summing over $\{k_2, \ldots, k_{m-1}\} \subseteq +\ca{V}$ and $\sigma \in S_\ca{M,A}^\sim$, such that $k_u$ are all distinct +and +% +$$ (u,v) \in \ca{E_M} \implies (k_{\sigma u}, k_{\sigma v}) \in \ca{E}\,. +\qquad (\dagger) $$ +% +This is because the vertex set $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ +indicates which vertices are present in the instance $\ca{H}$, and $\sigma$ +describes the mapping from $\ca{V_M}$ onto those vertices: $u \mapsto +k_{\sigma u}$. We take $\sigma \in S_\ca{M,A}^\sim$ to ensure that $\{i,j\} +\in \ca{A(H)}$ (since $i=k_1, \ j=k_m$), and that instances are counted +exactly once. +The condition $(\dagger)$ is to check that $\ca{H}$ is a functional instance +of $\ca{M}$ in $\ca{G}$. Hence +% +\begin{align*} +M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} +\sum_{\ca{M} \cong \ca{H} \leq +\ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in +\ca{E_H}} W(e) \\ +% +&= \frac{1}{|\ca{E_M}|} \sum_{\{ k_2, \ldots, k_{m-1} \}} \sum_{\sigma \in +S_\ca{M,A}^\sim} \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) +\big\} \sum_{e \in \ca{E_H}} W(e)\,. +\end{align*} +% +For the first term, by conditioning on the types of edge in $\ca{E_M}$: +\begin{align*} +% +\bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} +&= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{ k_{\sigma u} \neq k_{\sigma v} \} \\ +& \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{ (k_{\sigma u}, +k_{\sigma v}) \in \ca{E} \} \\ +& \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, +k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in +\ca{E}\} \\ +% +&= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} +\\ +% +&= J^\mathrm{func}_{\mathbf{k},\sigma}\,. +% +\end{align*} +% +Assuming $\big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\}$, the second +term is +% +\begin{align*} +% +\sum_{e \in \ca{E_H}} W(e) +&= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) ++ \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + +W((k_{\sigma v},k_{\sigma u})) \big) \\ +% +&= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} ++ \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}} \\ +% +&= G^\mathrm{func}_{\mathbf{k},\sigma} +\end{align*} +% +as required. 
For $(2)$, we simply change $(\dagger)$ to $(\ddagger)$ to check +that an instance is a \emph{structural} instance: +% +$$ (u,v) \in \ca{E_M} \iff (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \qquad +(\ddagger) $$ +% +Now for the first term: +% +\begin{align*} +% +\bb{I} \big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\} +&= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \notin +\ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ +& \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{(k_{\sigma u}, +k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) +\notin +\ca{E}\} \\ +& \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, +k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in +\ca{E}\} \\ +% +&= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{0})_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} +\\ +% +&= J^\mathrm{struc}_{\mathbf{k},\sigma}\,. +% +\end{align*} +% +Assuming $\big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\}$, the +second term is +% +\begin{align*} +% +\sum_{e \in \ca{E_H}} W(e) +&= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) ++ \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + +W((k_{\sigma v},k_{\sigma u})) \big) \\ +% +&= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma +v}} ++ \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}} \\ +% +&= G^\mathrm{struc}_{\mathbf{k},\sigma}\,. +\end{align*} + +\hfill $\square$ \end{prf} \pagebreak \begin{prf}[Proposition~\ref{prop:motif_adj_matrix_computation}, - Complexity of MAM formula] - \label{proof:motif_adj_matrix_computation} - Suppose ${m \leq 3}$ and consider $M^\mathrm{func}$. The adjacency and - indicator matrices of $\ca{G}$ are - % - \begin{equation*} - \begin{aligned}[c] - &(1) \quad J = \bb{I} \{ G>0 \}\,, \\ - &(2) \quad J_0 = \bb{I} \{ G + G^\top = 0 \} \circ J_\mathrm{n}\,, \\ - &(3) \quad J_\mathrm{s} = J - J_\mathrm{d}\,, \\ - &(4) \quad G_\mathrm{d} = (G + G^\top) \circ J_\mathrm{d} \,, - \end{aligned} - \hspace*{2cm} - \begin{aligned}[c] - &(5) \quad J_\mathrm{n} = \bb{I} \{I_{n \times n} = 0 \}\,, \\ - &(6) \quad J_\mathrm{d} = J \circ J^\top\,, \\ - &(7) \quad G_\mathrm{s} = G \circ J_\mathrm{s}\,, \\ - & - \end{aligned} - \end{equation*} - % - and are computed using four additions and four element-wise multiplications. - $J^\mathrm{func}_{\mathbf{k},\sigma}$ is a product of at most three factors, - and $G^\mathrm{func}_{\mathbf{k},\sigma}$ contains at most three summands, so - % - $$ \sum_{k_2 \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ - G^\mathrm{func}_{\mathbf{k},\sigma} $$ - % - is expressible as a sum of at most three matrices, each of which is - constructed with at most one matrix multiplication (where $\{k_{\sigma - r},k_{\sigma s}\} \neq \{i,j\}$) and one entry-wise multiplication (where - $\{k_{\sigma r},k_{\sigma s}\} = \{i,j\}$). This is repeated for each $\sigma - \in S_\ca{M,A}^\sim$ (at most six times) and the results are summed. - Calculations are identical for $M^\mathrm{struc}$. - - \hfill $\square$ +Complexity of MAM formula] +\label{proof:motif_adj_matrix_computation} +Suppose ${m \leq 3}$ and consider $M^\mathrm{func}$. 
The adjacency and +indicator matrices of $\ca{G}$ are +% +\begin{equation*} +\begin{aligned}[c] +&(1) \quad J = \bb{I} \{ G>0 \}\,, \\ +&(2) \quad J_0 = \bb{I} \{ G + G^\top = 0 \} \circ J_\mathrm{n}\,, \\ +&(3) \quad J_\mathrm{s} = J - J_\mathrm{d}\,, \\ +&(4) \quad G_\mathrm{d} = (G + G^\top) \circ J_\mathrm{d} \,, +\end{aligned} +\hspace*{2cm} +\begin{aligned}[c] +&(5) \quad J_\mathrm{n} = \bb{I} \{I_{n \times n} = 0 \}\,, \\ +&(6) \quad J_\mathrm{d} = J \circ J^\top\,, \\ +&(7) \quad G_\mathrm{s} = G \circ J_\mathrm{s}\,, \\ +& +\end{aligned} +\end{equation*} +% +and are computed using four additions and four element-wise multiplications. +$J^\mathrm{func}_{\mathbf{k},\sigma}$ is a product of at most three factors, +and $G^\mathrm{func}_{\mathbf{k},\sigma}$ contains at most three summands, so +% +$$ \sum_{k_2 \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ +G^\mathrm{func}_{\mathbf{k},\sigma} $$ +% +is expressible as a sum of at most three matrices, each of which is +constructed with at most one matrix multiplication (where $\{k_{\sigma +r},k_{\sigma s}\} \neq \{i,j\}$) and one entry-wise multiplication (where +$\{k_{\sigma r},k_{\sigma s}\} = \{i,j\}$). This is repeated for each $\sigma +\in S_\ca{M,A}^\sim$ (at most six times) and the results are summed. +Calculations are identical for $M^\mathrm{struc}$. + +\hfill $\square$ \end{prf} \begin{prf}[Proposition~\ref{prop:coll_expa_formulae}, - Colliders and expanders in bipartite graphs] - \label{proof:coll_expa_formulae} - % - Consider (1) and the collider motif $\ca{M}_\mathrm{coll}$. Since $\ca{G}$ is - bipartite, $M_\mathrm{coll}^\mathrm{func} = M_\mathrm{coll}^\mathrm{struc} = - \vcentcolon M_\mathrm{coll}$, and by Table~\ref{tab:motif_adj_mat_table}, - $M_\mathrm{coll} = \frac{1}{2} J_\mathrm{n} \circ (J G^\top + G J^\top)$. - Hence - % - \begin{align*} - (M_\mathrm{coll})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J G^\top + G - J^\top)_{i j} \\ - &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \Big(J_{i k} G_{j k} - + G_{i k} J_{j k} \Big) \\ - &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \,\bb{I} \, \Big\{ - (i,k),(j,k) \in \ca{E} \Big\} \Big[W((i,k)) + W((j,k))\Big] \\ - &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ - (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + - W((j,k)) \Big]\,. - \end{align*} - % - Similarly for the expander motif, $M_\mathrm{expa} = \frac{1}{2} J_\mathrm{n} - \circ (J^\top G + G^\top J)$ so - % - \begin{align*} - (M_\mathrm{expa})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J^\top G + - G^\top J)_{i j} \\ - &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ - (k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((k,i)) + - W((k,j)) \Big]\,. - \end{align*} - % - \hfill $\square$ +Colliders and expanders in bipartite graphs] +\label{proof:coll_expa_formulae} +% +Consider (1) and the collider motif $\ca{M}_\mathrm{coll}$. Since $\ca{G}$ is +bipartite, $M_\mathrm{coll}^\mathrm{func} = M_\mathrm{coll}^\mathrm{struc} = +\vcentcolon M_\mathrm{coll}$, and by Table~\ref{tab:motif_adj_mat_table}, +$M_\mathrm{coll} = \frac{1}{2} J_\mathrm{n} \circ (J G^\top + G J^\top)$. 
+Hence +% +\begin{align*} +(M_\mathrm{coll})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J G^\top + G +J^\top)_{i j} \\ +&= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \Big(J_{i k} G_{j k} ++ G_{i k} J_{j k} \Big) \\ +&= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \,\bb{I} \, \Big\{ +(i,k),(j,k) \in \ca{E} \Big\} \Big[W((i,k)) + W((j,k))\Big] \\ +&= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ +(i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + +W((j,k)) \Big]\,. +\end{align*} +% +Similarly for the expander motif, $M_\mathrm{expa} = \frac{1}{2} J_\mathrm{n} +\circ (J^\top G + G^\top J)$ so +% +\begin{align*} +(M_\mathrm{expa})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J^\top G + +G^\top J)_{i j} \\ +&= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ +(k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((k,i)) + +W((k,j)) \Big]\,. +\end{align*} +% +\hfill $\square$ \end{prf} \section{Examples} \begin{example}[Functional and structural instances] - \label{ex:instances} - Let $\ca{G}=(\ca{V,E})$ be the graph with $\ca{V} = \{ 1,2,3,4 \}$ and - $\ca{E} = \{ (1,2),(1,3),(1,4),(2,3),(3,4),(4,3) \}$. Let $(\ca{M,A})$ be the - anchored motif with $\ca{V_M} = \{1,2,3\}$, $\ca{E_M} = - \{(1,2),(1,3),(2,3)\}$ and $\ca{A} = \{1,3\}$ as defined in Figure - \ref{fig:instance_example_1}. - % - \begin{figure}[H] - \centering - \includegraphics[scale=0.7,draft=false]{% - ../tikz/instance_example_1/instance_example_1.pdf} - \caption{The specified graph $\ca{G}$ and anchored motif $\ca{M}$} - \label{fig:instance_example_1} - \end{figure} - % - There are three functional instances of $\ca{M}$ in $\ca{G}$, shown in - Figure~\ref{fig:instance_example_2}. However there is just one structural - instance of $\ca{M}$ in $\ca{G}$, given by $\ca{H}_1$. This is because the - double edge $3 \leftrightarrow 4$ in $\ca{G}$ prevents the subgraphs on - $\{1,3,4\}$ from being induced subgraphs. - % - \begin{align*} - \ca{H}_1 &: \quad \ca{V}_1 = \{ 1,2,3 \} ; \quad \ca{E}_1 = \{ (1,2) , - (2,3) , (1,3) \} ; \quad \ca{A(H}_1) = \big\{\{1,3\}\big\}\,, \\ - \ca{H}_2 &: \quad \ca{V}_2 = \{ 1,3,4 \} ; \quad \ca{E}_2 = \{ (1,3) , - (1,4) , (3,4) \} ; \quad \ca{A(H}_2) = \big\{\{1,4\}\big\}\,, \\ - \ca{H}_3 &: \quad \ca{V}_3 = \{ 1,3,4 \} ; \quad \ca{E}_3 = \{ (1,3) , - (1,4) , (4,3) \} ; \quad \ca{A(H}_3) = \big\{\{1,3\}\big\}\,. - \end{align*} - % - \begin{figure}[H] - \centering - \includegraphics[scale=0.7,draft=false]{% - ../tikz/instance_example_2/instance_example_2.pdf} - \caption{Functional instances $\ca{H}_1,\ca{H}_2$ and $\ca{H}_3$} - \label{fig:instance_example_2} - \end{figure} +\label{ex:instances} +Let $\ca{G}=(\ca{V,E})$ be the graph with $\ca{V} = \{ 1,2,3,4 \}$ and +$\ca{E} = \{ (1,2),(1,3),(1,4),(2,3),(3,4),(4,3) \}$. Let $(\ca{M,A})$ be the +anchored motif with $\ca{V_M} = \{1,2,3\}$, $\ca{E_M} = +\{(1,2),(1,3),(2,3)\}$ and $\ca{A} = \{1,3\}$ as defined in Figure +\ref{fig:instance_example_1}. +% +\begin{figure}[H] +\centering +\includegraphics[scale=0.7,draft=false]{% +../tikz/instance_example_1/instance_example_1.pdf} +\caption{The specified graph $\ca{G}$ and anchored motif $\ca{M}$} +\label{fig:instance_example_1} +\end{figure} +% +There are three functional instances of $\ca{M}$ in $\ca{G}$, shown in +Figure~\ref{fig:instance_example_2}. However there is just one structural +instance of $\ca{M}$ in $\ca{G}$, given by $\ca{H}_1$. 
This is because the +double edge $3 \leftrightarrow 4$ in $\ca{G}$ prevents the subgraphs on +$\{1,3,4\}$ from being induced subgraphs. +% +\begin{align*} +\ca{H}_1 &: \quad \ca{V}_1 = \{ 1,2,3 \} ; \quad \ca{E}_1 = \{ (1,2) , +(2,3) , (1,3) \} ; \quad \ca{A(H}_1) = \big\{\{1,3\}\big\}\,, \\ +\ca{H}_2 &: \quad \ca{V}_2 = \{ 1,3,4 \} ; \quad \ca{E}_2 = \{ (1,3) , +(1,4) , (3,4) \} ; \quad \ca{A(H}_2) = \big\{\{1,4\}\big\}\,, \\ +\ca{H}_3 &: \quad \ca{V}_3 = \{ 1,3,4 \} ; \quad \ca{E}_3 = \{ (1,3) , +(1,4) , (4,3) \} ; \quad \ca{A(H}_3) = \big\{\{1,3\}\big\}\,. +\end{align*} +% +\begin{figure}[H] +\centering +\includegraphics[scale=0.7,draft=false]{% +../tikz/instance_example_2/instance_example_2.pdf} +\caption{Functional instances $\ca{H}_1,\ca{H}_2$ and $\ca{H}_3$} +\label{fig:instance_example_2} +\end{figure} \end{example} \begin{example}[Motif adjacency matrices] - \label{ex:motif_adj_matrices} - Let $\ca{G}$ and $\ca{(M,A)}$ be as in Example~\ref{ex:instances}, and - suppose $\ca{G}$ has weight map $W((i,j)) \vcentcolon = i + j$. Then using - Definition~\ref{def:motif_adj_matrices} directly, the functional and - structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are respectively - - \vspace*{0.2cm} - $$ % - M^\mathrm{func} = - \begin{pmatrix} - 0 & 0 & 28 & 16 \\ - 0 & 0 & 0 & 0 \\ - 28 & 0 & 0 & 0 \\ - 16 & 0 & 0 & 0 - \end{pmatrix} \,, - \qquad - M^\mathrm{struc} = - \begin{pmatrix} - 0 & 0 & 12 & 0 \\ - 0 & 0 & 0 & 0 \\ - 12 & 0 & 0 & 0 \\ - 0 & 0 & 0 & 0 - \end{pmatrix}\,. - $$ +\label{ex:motif_adj_matrices} +Let $\ca{G}$ and $\ca{(M,A)}$ be as in Example~\ref{ex:instances}, and +suppose $\ca{G}$ has weight map $W((i,j)) \vcentcolon = i + j$. Then using +Definition~\ref{def:motif_adj_matrices} directly, the functional and +structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are respectively + +\vspace*{0.2cm} +$$ % +M^\mathrm{func} = +\begin{pmatrix} +0 & 0 & 28 & 16 \\ +0 & 0 & 0 & 0 \\ +28 & 0 & 0 & 0 \\ +16 & 0 & 0 & 0 +\end{pmatrix} \,, +\qquad +M^\mathrm{struc} = +\begin{pmatrix} +0 & 0 & 12 & 0 \\ +0 & 0 & 0 & 0 \\ +12 & 0 & 0 & 0 \\ +0 & 0 & 0 & 0 +\end{pmatrix}\,. +$$ \end{example} \pagebreak \begin{example}[Calculating an explicit formula for an MAM] - \label{ex:motif_adj_calc} - Consider the functional MAM of the simple motif $\ca{M}_6$ - (Figure~\ref{fig:M6}). - % - \begin{figure}[H] - \centering - \includegraphics[scale=0.7,draft=false]{../tikz/M6/M6.pdf} - \caption{The motif $\ca{M}_6$} - \label{fig:M6} - \end{figure} - % - We use Equation (1) in Proposition~\ref{prop:motif_adj_matrix_formula}. - Firstly, $m = |\ca{V_M}| = 3$ and $|\ca{E_M}| = 4$. The automorphism group of - $\ca{M}_6$ has order 2, corresponding to swapping vertices 1 and 3. Hence - $|S_\ca{M,A}^\sim| = |S_m| / 2 = 6/2 = 3$, and suitable representatives from - $S_\ca{M,A}^\sim$ are - - $$ S_\ca{M,A}^\sim = \left\{ - % - \sigma_1 = - \begin{pmatrix} - 1 & 2 & 3 \\ - 1 & 2 & 3 - \end{pmatrix}, - % - \sigma_2 = - \begin{pmatrix} - 1 & 2 & 3 \\ - 2 & 1 & 3 - \end{pmatrix}, - % - \sigma_3 = - \begin{pmatrix} - 1 & 2 & 3 \\ - 1 & 3 & 2 - \end{pmatrix} - \right\}\,. 
\vspace*{0.2cm}$$ - % - So by Proposition~\ref{prop:motif_adj_matrix_formula}, with $i=k_1$ and - $j=k_3$, and writing $k$ for $k_2$: - - $$ - M^\mathrm{func}_{i j} = \frac{1}{4} \sum_{\sigma \in S_\ca{M,A}^\sim} \ - \sum_{k \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ - G^\mathrm{func}_{\mathbf{k},\sigma} - $$ - % - where since there are no missing edges in $\ca{M}_6$: - % - \begin{align*} - % - J^\mathrm{func}_{\mathbf{k},\sigma} - &= \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} - \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}}\,, \\ - % - G^\mathrm{func}_{\mathbf{k},\sigma} - &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} - + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma - v}}\,. - % - \end{align*} - % - Writing out the sum over $\sigma$: - % - \begingroup - \allowdisplaybreaks - \begin{align*} - M^\mathrm{func}_{i j} - &= \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_1} \ - G^\mathrm{func}_{\mathbf{k},\sigma_1} + \frac{1}{4} \sum_{k=1}^n - J^\mathrm{func}_{\mathbf{k},\sigma_2} \ - G^\mathrm{func}_{\mathbf{k},\sigma_2} + \frac{1}{4} \sum_{k=1}^n - J^\mathrm{func}_{\mathbf{k},\sigma_3} \ - G^\mathrm{func}_{\mathbf{k},\sigma_3} \\ - % - &= \frac{1}{4} \sum_{k=1}^n J_{j i} J_{j k} (J_\mathrm{d})_{i k} - \big(G_{j i} + G_{j k} + (G_\mathrm{d})_{i k}\big) \\ - & \qquad + \frac{1}{4} \sum_{k=1}^n J_{i j} J_{i k} (J_\mathrm{d})_{j k} - \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{j k}\big) \\ - & \qquad + \frac{1}{4} \sum_{k=1}^n J_{k i} J_{k j} (J_\mathrm{d})_{i j} - \big(G_{k i} + G_{k j} + (G_\mathrm{d})_{i j}\big) \\ - % - & \\ - & \\ - & \\ - &= \frac{1}{4} J^\top_{i j} \sum_{k=1}^n (J_\mathrm{d})_{i k} - J^\top_{k j} \big(G^\top_{i j} + (G_\mathrm{d})_{i k} + G^\top_{k j}\big) \\ - & \qquad + \frac{1}{4} J_{i j} \sum_{k=1}^n J_{i k} - (J_\mathrm{d})_{k j} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{k j}\big) \\ - & \qquad + \frac{1}{4} (J_\mathrm{d})_{i j} - \sum_{k=1}^n J^\top_{i k} J_{k j} - \big((G_\mathrm{d})_{i j} + G^\top_{i k} + G_{k j}\big) \,, - \end{align*} - \endgroup - % - and writing this as a sum of entry-wise and matrix products: - % - \begin{align*} - M^\textrm{func} &= \frac{1}{4} \Big[ J^\top \circ (J_\mathrm{d} G^\top) + - J^\top \circ (G_\mathrm{d} J^\top) + G^\top \circ (J_\mathrm{d} J^\top) - \Big] \\ - & \qquad + \frac{1}{4} \Big[ J \circ (J G_\mathrm{d}) + J \circ (G - J_\mathrm{d}) + G \circ (J J_\mathrm{d}) \Big] \\ - & \qquad + \frac{1}{4} \Big[ J_\mathrm{d} \circ (J^\top G) + J_\mathrm{d} - \circ (G^\top J) + G_\mathrm{d} \circ (J^\top J) \Big] - \end{align*} - % - where $A \circ B$ is an entry-wise product and $AB$ is a matrix product. - Finally, setting - $$C = J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J - J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)\,, $$ - and - $$ C' = G_\mathrm{d} \circ (J^\top J)\,, $$ - then we have that - $$ M^\mathrm{func} = \frac{1}{4} \big(C + C^\top + C' \big)\,. $$ - as in Table~\ref{tab:motif_adj_mat_table}, achieved with just five matrix - multiplications, nine entry-wise multiplications and nine matrix additions - (including the four entry-wise multiplications and four additions needed to - construct the adjacency and indicator matrices). +\label{ex:motif_adj_calc} +Consider the functional MAM of the simple motif $\ca{M}_6$ +(Figure~\ref{fig:M6}). 
+% +\begin{figure}[H] +\centering +\includegraphics[scale=0.7,draft=false]{../tikz/M6/M6.pdf} +\caption{The motif $\ca{M}_6$} +\label{fig:M6} +\end{figure} +% +We use Equation (1) in Proposition~\ref{prop:motif_adj_matrix_formula}. +Firstly, $m = |\ca{V_M}| = 3$ and $|\ca{E_M}| = 4$. The automorphism group of +$\ca{M}_6$ has order 2, corresponding to swapping vertices 1 and 3. Hence +$|S_\ca{M,A}^\sim| = |S_m| / 2 = 6/2 = 3$, and suitable representatives from +$S_\ca{M,A}^\sim$ are + +$$ S_\ca{M,A}^\sim = \left\{ +% +\sigma_1 = +\begin{pmatrix} +1 & 2 & 3 \\ +1 & 2 & 3 +\end{pmatrix}, +% +\sigma_2 = +\begin{pmatrix} +1 & 2 & 3 \\ +2 & 1 & 3 +\end{pmatrix}, +% +\sigma_3 = +\begin{pmatrix} +1 & 2 & 3 \\ +1 & 3 & 2 +\end{pmatrix} +\right\}\,. \vspace*{0.2cm}$$ +% +So by Proposition~\ref{prop:motif_adj_matrix_formula}, with $i=k_1$ and +$j=k_3$, and writing $k$ for $k_2$: + +$$ +M^\mathrm{func}_{i j} = \frac{1}{4} \sum_{\sigma \in S_\ca{M,A}^\sim} \ +\sum_{k \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ +G^\mathrm{func}_{\mathbf{k},\sigma} +$$ +% +where since there are no missing edges in $\ca{M}_6$: +% +\begin{align*} +% +J^\mathrm{func}_{\mathbf{k},\sigma} +&= \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} +\prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}}\,, \\ +% +G^\mathrm{func}_{\mathbf{k},\sigma} +&= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} ++ \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma +v}}\,. +% +\end{align*} +% +Writing out the sum over $\sigma$: +% +\begingroup +\allowdisplaybreaks +\begin{align*} +M^\mathrm{func}_{i j} +&= \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_1} \ +G^\mathrm{func}_{\mathbf{k},\sigma_1} + \frac{1}{4} \sum_{k=1}^n +J^\mathrm{func}_{\mathbf{k},\sigma_2} \ +G^\mathrm{func}_{\mathbf{k},\sigma_2} + \frac{1}{4} \sum_{k=1}^n +J^\mathrm{func}_{\mathbf{k},\sigma_3} \ +G^\mathrm{func}_{\mathbf{k},\sigma_3} \\ +% +&= \frac{1}{4} \sum_{k=1}^n J_{j i} J_{j k} (J_\mathrm{d})_{i k} +\big(G_{j i} + G_{j k} + (G_\mathrm{d})_{i k}\big) \\ +& \qquad + \frac{1}{4} \sum_{k=1}^n J_{i j} J_{i k} (J_\mathrm{d})_{j k} +\big(G_{i j} + G_{i k} + (G_\mathrm{d})_{j k}\big) \\ +& \qquad + \frac{1}{4} \sum_{k=1}^n J_{k i} J_{k j} (J_\mathrm{d})_{i j} +\big(G_{k i} + G_{k j} + (G_\mathrm{d})_{i j}\big) \\ +% +& \\ +& \\ +& \\ +&= \frac{1}{4} J^\top_{i j} \sum_{k=1}^n (J_\mathrm{d})_{i k} +J^\top_{k j} \big(G^\top_{i j} + (G_\mathrm{d})_{i k} + G^\top_{k j}\big) \\ +& \qquad + \frac{1}{4} J_{i j} \sum_{k=1}^n J_{i k} +(J_\mathrm{d})_{k j} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{k j}\big) \\ +& \qquad + \frac{1}{4} (J_\mathrm{d})_{i j} +\sum_{k=1}^n J^\top_{i k} J_{k j} +\big((G_\mathrm{d})_{i j} + G^\top_{i k} + G_{k j}\big) \,, +\end{align*} +\endgroup +% +and writing this as a sum of entry-wise and matrix products: +% +\begin{align*} +M^\textrm{func} &= \frac{1}{4} \Big[ J^\top \circ (J_\mathrm{d} G^\top) + +J^\top \circ (G_\mathrm{d} J^\top) + G^\top \circ (J_\mathrm{d} J^\top) +\Big] \\ +& \qquad + \frac{1}{4} \Big[ J \circ (J G_\mathrm{d}) + J \circ (G +J_\mathrm{d}) + G \circ (J J_\mathrm{d}) \Big] \\ +& \qquad + \frac{1}{4} \Big[ J_\mathrm{d} \circ (J^\top G) + J_\mathrm{d} +\circ (G^\top J) + G_\mathrm{d} \circ (J^\top J) \Big] +\end{align*} +% +where $A \circ B$ is an entry-wise product and $AB$ is a matrix product. 
+Finally, setting +$$C = J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J +J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)\,, $$ +and +$$ C' = G_\mathrm{d} \circ (J^\top J)\,, $$ +then we have that +$$ M^\mathrm{func} = \frac{1}{4} \big(C + C^\top + C' \big)\,. $$ +as in Table~\ref{tab:motif_adj_mat_table}, achieved with just five matrix +multiplications, nine entry-wise multiplications and nine matrix additions +(including the four entry-wise multiplications and four additions needed to +construct the adjacency and indicator matrices). \end{example} \clearpage{} \clearpage{} @@ -2109,150 +2109,150 @@ \chapter{Motif Adjacency Matrix Formulae} \vspace*{0.2cm} \begin{table}[H] - \centering - \renewcommand{\arraystretch}{1.8} - \tiny +\centering +\renewcommand{\arraystretch}{1.8} +\tiny - \begin{tabular}{ |c|c|c|c| } +\begin{tabular}{ |c|c|c|c| } - \hline +\hline - Motif & $C$ & $C'$ & $M^\mathrm{func}$ \\ +Motif & $C$ & $C'$ & $M^\mathrm{func}$ \\ - \hline +\hline - $\ca{M}_\mathrm{s}$ & & & $G + G^\top$ \\ +$\ca{M}_\mathrm{s}$ & & & $G + G^\top$ \\ - \hline +\hline - $\ca{M}_\mathrm{d}$ & & & $\frac{1}{2} G_\mathrm{d}$ \\ +$\ca{M}_\mathrm{d}$ & & & $\frac{1}{2} G_\mathrm{d}$ \\ - \hline +\hline - $\ca{M}_1$ & $J^\top \circ (J G) + J^\top \circ (G J) + G^\top \circ (J J)$ - & & $\frac{1}{3} \big(C + C^\top\big)$ \\ +$\ca{M}_1$ & $J^\top \circ (J G) + J^\top \circ (G J) + G^\top \circ (J J)$ +& & $\frac{1}{3} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_2$ & \rule{0pt}{2.7em}$\displaystyle - \begin{aligned} - & J^\top \circ (J_\mathrm{d} G) + J^\top \circ (G_\mathrm{d} J) + G^\top - \circ (J_\mathrm{d} J) \\ - & + J^\top \circ (J G_\mathrm{d}) + J^\top \circ (G J_\mathrm{d}) + - G^\top \circ (J J_\mathrm{d}) \\ - & + J_\mathrm{d} \circ (J G) + J_\mathrm{d} \circ (G J) + G_\mathrm{d} - \circ (J J) - \end{aligned} - $\rule[-2em]{0pt}{1em} & & $\frac{1}{4} \big(C + C^\top\big)$ \\ +$\ca{M}_2$ & \rule{0pt}{2.7em}$\displaystyle +\begin{aligned} +& J^\top \circ (J_\mathrm{d} G) + J^\top \circ (G_\mathrm{d} J) + G^\top +\circ (J_\mathrm{d} J) \\ +& + J^\top \circ (J G_\mathrm{d}) + J^\top \circ (G J_\mathrm{d}) + +G^\top \circ (J J_\mathrm{d}) \\ +& + J_\mathrm{d} \circ (J G) + J_\mathrm{d} \circ (G J) + G_\mathrm{d} +\circ (J J) +\end{aligned} +$\rule[-2em]{0pt}{1em} & & $\frac{1}{4} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_3$ & \rule{0pt}{2.7em}$\displaystyle - \begin{aligned} - & J \circ (J_\mathrm{d} G_\mathrm{d}) + J \circ (G_\mathrm{d} - J_\mathrm{d}) + G \circ (J_\mathrm{d} J_\mathrm{d}) \\ - & + J_\mathrm{d} \circ (J_\mathrm{d} G) + J_\mathrm{d} \circ - (G_\mathrm{d} J) + G_\mathrm{d} \circ (J_\mathrm{d} J) \\ - & + J_\mathrm{d} \circ (J G_\mathrm{d}) + J_\mathrm{d} \circ (G - J_\mathrm{d}) + G_\mathrm{d} \circ (J J_\mathrm{d}) - \end{aligned} - $\rule[-2em]{0pt}{1em} & & $\frac{1}{5} \big(C + C^\top\big)$ \\ +$\ca{M}_3$ & \rule{0pt}{2.7em}$\displaystyle +\begin{aligned} +& J \circ (J_\mathrm{d} G_\mathrm{d}) + J \circ (G_\mathrm{d} +J_\mathrm{d}) + G \circ (J_\mathrm{d} J_\mathrm{d}) \\ +& + J_\mathrm{d} \circ (J_\mathrm{d} G) + J_\mathrm{d} \circ +(G_\mathrm{d} J) + G_\mathrm{d} \circ (J_\mathrm{d} J) \\ +& + J_\mathrm{d} \circ (J G_\mathrm{d}) + J_\mathrm{d} \circ (G +J_\mathrm{d}) + G_\mathrm{d} \circ (J J_\mathrm{d}) +\end{aligned} +$\rule[-2em]{0pt}{1em} & & $\frac{1}{5} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_4$ & $ J_\mathrm{d} \circ (J_\mathrm{d} G_\mathrm{d}) + - J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{d}) + G_\mathrm{d} \circ - 
(J_\mathrm{d} J_\mathrm{d}) $ & & $ \frac{1}{6} C$ \\ +$\ca{M}_4$ & $ J_\mathrm{d} \circ (J_\mathrm{d} G_\mathrm{d}) + +J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{d}) + G_\mathrm{d} \circ +(J_\mathrm{d} J_\mathrm{d}) $ & & $ \frac{1}{6} C$ \\ - \hline +\hline - $\ca{M}_5$ & \rule{0pt}{2.7em}$\displaystyle - \begin{aligned} - & J \circ (J G) + J \circ (G J) + G \circ (J J) \\ - & + J \circ (J G^\top) + J \circ (G J^\top) + G \circ (J J^\top) \\ - & + J \circ (J^\top G) + J \circ (G^\top J) + G \circ (J^\top J) - \end{aligned} - $\rule[-2em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ +$\ca{M}_5$ & \rule{0pt}{2.7em}$\displaystyle +\begin{aligned} +& J \circ (J G) + J \circ (G J) + G \circ (J J) \\ +& + J \circ (J G^\top) + J \circ (G J^\top) + G \circ (J J^\top) \\ +& + J \circ (J^\top G) + J \circ (G^\top J) + G \circ (J^\top J) +\end{aligned} +$\rule[-2em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_6$ & $J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ - (J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)$ & $G_\mathrm{d} \circ - (J^\top J)$ & $\frac{1}{4} \big(C + C^\top + C' \big)$ \\ +$\ca{M}_6$ & $J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ +(J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)$ & $G_\mathrm{d} \circ +(J^\top J)$ & $\frac{1}{4} \big(C + C^\top + C' \big)$ \\ - \hline +\hline - $\ca{M}_7$ & $J \circ (J_\mathrm{d} G) + J \circ (G_\mathrm{d} J) + G \circ - (J_\mathrm{d} J)$ & $J_\mathrm{d} \circ (J G^\top) + J_\mathrm{d} \circ (G - J^\top) + G_\mathrm{d} \circ (J J^\top)$ & $ \frac{1}{4} \big(C + C^\top + - C' \big)$ \\ +$\ca{M}_7$ & $J \circ (J_\mathrm{d} G) + J \circ (G_\mathrm{d} J) + G \circ +(J_\mathrm{d} J)$ & $J_\mathrm{d} \circ (J G^\top) + J_\mathrm{d} \circ (G +J^\top) + G_\mathrm{d} \circ (J J^\top)$ & $ \frac{1}{4} \big(C + C^\top + +C' \big)$ \\ - \hline +\hline - $\ca{M}_8$ & $J \circ (G J_\mathrm{n}) + G \circ (J J_\mathrm{n})$ & - $J_\mathrm{n} \circ (J^\top G) + J_\mathrm{n} \circ (G^\top J)$ & - $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ +$\ca{M}_8$ & $J \circ (G J_\mathrm{n}) + G \circ (J J_\mathrm{n})$ & +$J_\mathrm{n} \circ (J^\top G) + J_\mathrm{n} \circ (G^\top J)$ & +$\frac{1}{2} \big(C + C^\top + C' \big)$ \\ - \hline +\hline - $\ca{M}_9$ & \rule{0pt}{1.9em}$\displaystyle - \begin{aligned} - & J \circ (J_\mathrm{n} G^\top) + G \circ (J_\mathrm{n} J^\top) + - J_\mathrm{n} \circ (J G) \\ - & + J_\mathrm{n} \circ (G J) + J \circ (G^\top J_\mathrm{n}) + G \circ - (J^\top J_\mathrm{n}) - \end{aligned} - $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{2} \big(C + C^\top\big)$ \\ +$\ca{M}_9$ & \rule{0pt}{1.9em}$\displaystyle +\begin{aligned} +& J \circ (J_\mathrm{n} G^\top) + G \circ (J_\mathrm{n} J^\top) + +J_\mathrm{n} \circ (J G) \\ +& + J_\mathrm{n} \circ (G J) + J \circ (G^\top J_\mathrm{n}) + G \circ +(J^\top J_\mathrm{n}) +\end{aligned} +$\rule[-1.3em]{0pt}{1em} & & $\frac{1}{2} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_{10}$ & $J \circ (J_\mathrm{n} G) + G \circ (J_\mathrm{n} J)$ & - $J_\mathrm{n} \circ (J G^\top) + J_\mathrm{n} \circ (G J^\top)$ & - $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ +$\ca{M}_{10}$ & $J \circ (J_\mathrm{n} G) + G \circ (J_\mathrm{n} J)$ & +$J_\mathrm{n} \circ (J G^\top) + J_\mathrm{n} \circ (G J^\top)$ & +$\frac{1}{2} \big(C + C^\top + C' \big)$ \\ - \hline +\hline - $\ca{M}_{11}$ & \rule{0pt}{1.9em}$\displaystyle - \begin{aligned} - & J_\mathrm{d} \circ (G J_\mathrm{n}) + G_\mathrm{d} \circ (J - J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G) \\ - 
& + J_\mathrm{n} \circ (G_\mathrm{d} J) + J \circ (G_\mathrm{d} - J_\mathrm{n}) + G \circ (J_\mathrm{d} J_\mathrm{n}) - \end{aligned} - $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ +$\ca{M}_{11}$ & \rule{0pt}{1.9em}$\displaystyle +\begin{aligned} +& J_\mathrm{d} \circ (G J_\mathrm{n}) + G_\mathrm{d} \circ (J +J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G) \\ +& + J_\mathrm{n} \circ (G_\mathrm{d} J) + J \circ (G_\mathrm{d} +J_\mathrm{n}) + G \circ (J_\mathrm{d} J_\mathrm{n}) +\end{aligned} +$\rule[-1.3em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ - \hline +\hline - $\ca{M}_{12}$ & \rule{0pt}{1.9em}$\displaystyle - \begin{aligned} - & J_\mathrm{d} \circ (J_\mathrm{n} G) + G_\mathrm{d} \circ (J_\mathrm{n} - J) + J_\mathrm{n} \circ (J G_\mathrm{d}) \\ - & + J_\mathrm{n} \circ (G J_\mathrm{d}) + J \circ (J_\mathrm{n} - G_\mathrm{d}) + G \circ (J_\mathrm{n} J_\mathrm{d}) - \end{aligned} - $\rule[-1.3em]{0pt}{1em} & & $ \frac{1}{3} \big(C + C^\top\big)$ \\ +$\ca{M}_{12}$ & \rule{0pt}{1.9em}$\displaystyle +\begin{aligned} +& J_\mathrm{d} \circ (J_\mathrm{n} G) + G_\mathrm{d} \circ (J_\mathrm{n} +J) + J_\mathrm{n} \circ (J G_\mathrm{d}) \\ +& + J_\mathrm{n} \circ (G J_\mathrm{d}) + J \circ (J_\mathrm{n} +G_\mathrm{d}) + G \circ (J_\mathrm{n} J_\mathrm{d}) +\end{aligned} +$\rule[-1.3em]{0pt}{1em} & & $ \frac{1}{3} \big(C + C^\top\big)$ \\ - \hline - - $\ca{M}_{13}$ & $J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{n}) + - G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{n}) + J_\mathrm{n} \circ - (J_\mathrm{d} G_\mathrm{d})$ & & $\frac{1}{4} \big(C + C^\top \big)$ \\ - - \hline +\hline + +$\ca{M}_{13}$ & $J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{n}) + +G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{n}) + J_\mathrm{n} \circ +(J_\mathrm{d} G_\mathrm{d})$ & & $\frac{1}{4} \big(C + C^\top \big)$ \\ + +\hline - $\ca{M}_\mathrm{coll}$ & $J_\mathrm{n} \circ (J G^\top)$ & & $\frac{1}{2} - \big( C + C^\top \big)$ \\ +$\ca{M}_\mathrm{coll}$ & $J_\mathrm{n} \circ (J G^\top)$ & & $\frac{1}{2} +\big( C + C^\top \big)$ \\ - \hline +\hline - $\ca{M}_\mathrm{expa}$ & $J_\mathrm{n} \circ (J^\top G)$ & & $\frac{1}{2} - \big( C + C^\top \big)$ \\ +$\ca{M}_\mathrm{expa}$ & $J_\mathrm{n} \circ (J^\top G)$ & & $\frac{1}{2} +\big( C + C^\top \big)$ \\ - \hline +\hline - \end{tabular} - \caption{Functional motif adjacency matrix formulae} - \label{tab:motif_adj_mat_table} +\end{tabular} +\caption{Functional motif adjacency matrix formulae} +\label{tab:motif_adj_mat_table} \end{table} \clearpage{} \clearpage{} @@ -2268,16 +2268,16 @@ \subsection{Hardware and software} \label{sec:notes_hardware} % % \begin{itemize} - \item \textbf{igraph} \cite{r_igraph} for plotting networks - \item \textbf{LICORS} \cite{r_LICORS} for an implementation of $k$-means++ - \item \textbf{mclust} \cite{r_mclust} for an implementation of ARI - \item - \textbf{rnaturalearth} \cite{r_rnaturalearth} for world territory boundary - data - \item \textbf{RSpectra} \cite{r_RSpectra} for eigendecomposition of - sparse matrices - \item \textbf{USAboundaries} \cite{r_USAboundaries} for US - county and state boundary data +\item \textbf{igraph} \cite{r_igraph} for plotting networks +\item \textbf{LICORS} \cite{r_LICORS} for an implementation of $k$-means++ +\item \textbf{mclust} \cite{r_mclust} for an implementation of ARI +\item +\textbf{rnaturalearth} \cite{r_rnaturalearth} for world territory boundary +data +\item \textbf{RSpectra} \cite{r_RSpectra} for eigendecomposition of +sparse matrices +\item \textbf{USAboundaries} 
\cite{r_USAboundaries} for US +county and state boundary data \end{itemize} \subsection{Timings for MAM computations} \label{sec:notes_timing} @@ -2289,11 +2289,11 @@ \subsection{Timings for MAM computations} \label{sec:notes_timing} \vspace*{0.3cm} \begin{table}[H] - \centering \renewcommand{\arraystretch}{1.5} - \setlength\tabcolsep{0.2em} \scriptsize - \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} - \hline - \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & +\centering \renewcommand{\arraystretch}{1.5} +\setlength\tabcolsep{0.2em} \scriptsize +\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} +\hline +\cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & @@ -2321,18 +2321,18 @@ \subsection{Timings for MAM computations} \label{sec:notes_timing} \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.014 & 0.019 & 0.019 & 0.031 & 0.029 & 0.019 & 0.033 & 0.025 & 0.032 & 0.023 & 0.028 & 0.023 & 0.026 & 0.025 & 0.019 - \\ \hline - \end{tabular} - \caption{Timings for MAM computation with $n=100$}% - \label{tab:timing_n_100}% +\\ \hline +\end{tabular} +\caption{Timings for MAM computation with $n=100$}% +\label{tab:timing_n_100}% \end{table} \begin{table}[H] - \centering \renewcommand{\arraystretch}{1.5} - \setlength\tabcolsep{0.2em} \scriptsize - \begin{tabular}{ - |c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline - \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & +\centering \renewcommand{\arraystretch}{1.5} +\setlength\tabcolsep{0.2em} \scriptsize +\begin{tabular}{ +|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline +\cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & @@ -2360,17 +2360,17 @@ \subsection{Timings for MAM computations} \label{sec:notes_timing} \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.23 & 0.22 & 0.60 & 1.1 & 0.57 & 0.24 & 1.4 & 0.86 & 0.69 & 1.5 & 2.3 & 1.6 & 1.6 & 1.6 & 0.67 - \\ \hline - \end{tabular} - \caption{Timings - for MAM computation with $n=1000$} \label{tab:timing_n_1000} +\\ \hline +\end{tabular} +\caption{Timings +for MAM computation with $n=1000$} \label{tab:timing_n_1000} \end{table} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} - \setlength\tabcolsep{0.2em} \scriptsize - \begin{tabular}{ - |c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline - \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & +\setlength\tabcolsep{0.2em} \scriptsize +\begin{tabular}{ +|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline +\cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & @@ -2398,10 +2398,10 @@ \subsection{Timings for MAM computations} \label{sec:notes_timing} \hline \cellcolor[HTML]{E9E9E9} 0.1 & 33 & 31 & 170 & 260 & 160 & 53 & 410 & 210 & 210 & 700 & 1100 & 520 & 760 & 580 & 150 - \\ \hline - \end{tabular} - \caption{Timings - for MAM computation with $n=10 \, 000$} \label{tab:timing_n_10000} +\\ \hline +\end{tabular} +\caption{Timings +for MAM computation with $n=10 \, 000$} \label{tab:timing_n_10000} \end{table} \section{Data preprocessing} \label{sec:notes_preprocessing} @@ -2417,21 +2417,21 @@ \section{US map} \label{sec:notes_us_map} % \vspace*{-0.8cm} \begin{figure}[H] - \centering - \includegraphics[scale=0.6,draft=false]{% - 
../../results/us_migration/us_migration_map_state_names.pdf} - \vspace*{-0.5cm} \caption{US map with state boundaries and state - abbreviations} \label{fig:notes_us_map} +\centering +\includegraphics[scale=0.6,draft=false]{% +../../results/us_migration/us_migration_map_state_names.pdf} +\vspace*{-0.5cm} \caption{US map with state boundaries and state +abbreviations} \label{fig:notes_us_map} \end{figure} \section{Word count} The word count of this dissertation is 6230 - \unskip, obtained +\unskip, obtained using \TeX \hspace*{-0.15cm} count by running % \begin{center} - \texttt{texcount -relaxed -inc -0 -sum=1,1,1,0,0,0,0\,}. +\texttt{texcount -relaxed -inc -0 -sum=1,1,1,0,0,0,0\,}. \end{center} % %The final dissertation should be no longer than 7,500 words, this usually diff --git a/tests/example2_out.tex b/tests/masters_dissertation_out.tex similarity index 100% rename from tests/example2_out.tex rename to tests/masters_dissertation_out.tex diff --git a/tests/phd_dissertation_in.tex b/tests/phd_dissertation_in.tex new file mode 100644 index 0000000..8919eec --- /dev/null +++ b/tests/phd_dissertation_in.tex @@ -0,0 +1,27572 @@ +% !TeX program = lualatex + +%! TeX root = phd_dissertation.tex + +\pdfvariable suppressoptionalinfo 512\relax +\documentclass[11pt,lof]{puthesis} + +% packages +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage[amsmath,thmmarks,noconfig]{ntheorem} +\usepackage{mathtools} +\usepackage{multirow} +\usepackage{pgfplots} +\usepackage{graphicx} +\usepackage{enumitem} +\usepackage{subcaption} +\usepackage{titlesec} +\usepackage{stackengine} +\usepackage{scalerel} +\usepackage{microtype} +\usepackage[boxruled,linesnumbered,commentsnumbered,procnumbered]{algorithm2e} +\usepackage[longnamesfirst]{natbib} +\usepackage[hypertexnames=false,hidelinks]{hyperref} +\usepackage[norefs,nocites]{refcheck} +\usepackage[defaultlines=3,all]{nowidow} +\usepackage{float} + +% settings +\pgfplotsset{compat=1.9} +\newcommand{\TODO}[1]{\textcolor{red}{\textsc{TODO}: #1}} +\setcitestyle{round} +\captionsetup[subfigure]{justification=centering} +\def\arraystretch{1.3} +\renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textit{#1}} + +% tables numbered as figures +\def\table{\def\figurename{Table}\figure} +\let\endtable\endfigure +\renewcommand\listfigurename{List of Figures and Tables} + +% arxiv +\newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{\texttt{arXiv:#1}}} + +% github +\newcommand{\github}[1]{\href{https://github.com/#1}{\texttt{github.com/#1}}} + +% blackboard +\renewcommand{\P}{\ensuremath{\mathbb{P}}} +\newcommand{\N}{\ensuremath{\mathbb{N}}} +\newcommand{\R}{\ensuremath{\mathbb{R}}} +\newcommand{\E}{\ensuremath{\mathbb{E}}} +\newcommand{\Q}{\ensuremath{\mathbb{Q}}} +\newcommand{\I}{\ensuremath{\mathbb{I}}} +\newcommand{\Z}{\ensuremath{\mathbb{Z}}} + +% roman +\newcommand{\rF}{\ensuremath{\mathrm{F}}} +\newcommand{\rH}{\ensuremath{\mathrm{H}}} +\newcommand{\rL}{\ensuremath{\mathrm{L}}} +\newcommand{\rk}{\ensuremath{\mathrm{k}}} +\newcommand{\rd}{\ensuremath{\mathrm{d}}} +\newcommand{\comp}{\ensuremath{\mathrm{c}}} +\newcommand{\TV}{\mathrm{TV}} + +% bold +\newcommand{\bW}{\ensuremath{\mathbf{W}}} +\newcommand{\bY}{\ensuremath{\mathbf{Y}}} +\newcommand{\bX}{\ensuremath{\mathbf{X}}} +\newcommand{\bT}{\ensuremath{\mathbf{T}}} +\newcommand{\bA}{\ensuremath{\mathbf{A}}} +\newcommand{\bV}{\ensuremath{\mathbf{V}}} + +% calligraphic +\newcommand{\cH}{\ensuremath{\mathcal{H}}} +\newcommand{\cF}{\ensuremath{\mathcal{F}}} +\newcommand{\cN}{\ensuremath{\mathcal{N}}} 
+\newcommand{\cX}{\ensuremath{\mathcal{X}}} +\newcommand{\cG}{\ensuremath{\mathcal{G}}} +\newcommand{\cW}{\ensuremath{\mathcal{W}}} +\newcommand{\cB}{\ensuremath{\mathcal{B}}} +\newcommand{\cS}{\ensuremath{\mathcal{S}}} +\newcommand{\cT}{\ensuremath{\mathcal{T}}} +\newcommand{\cV}{\ensuremath{\mathcal{V}}} +\newcommand{\cE}{\ensuremath{\mathcal{E}}} +\newcommand{\cU}{\ensuremath{\mathcal{U}}} +\newcommand{\cR}{\ensuremath{\mathcal{R}}} +\newcommand{\cA}{\ensuremath{\mathcal{A}}} +\newcommand{\cC}{\ensuremath{\mathcal{C}}} +\newcommand{\cM}{\ensuremath{\mathcal{M}}} +\newcommand{\cD}{\ensuremath{\mathcal{D}}} +\newcommand{\cP}{\ensuremath{\mathcal{P}}} +\newcommand{\cI}{\ensuremath{\mathcal{I}}} +\newcommand{\cY}{\ensuremath{\mathcal{Y}}} + +% sans serif +\newcommand{\T}{\ensuremath{\mathsf{T}}} + +% symbols +\newcommand{\vvvert}{{\vert\kern-0.25ex\vert\kern-0.25ex\vert}} +\newcommand{\bigvvvert}{{\big\vert\kern-0.35ex\big\vert\kern-0.35ex\big\vert}} +\newcommand{\Bigvvvert}{{\Big\vert\kern-0.3ex\Big\vert\kern-0.3ex\Big\vert}} +\newcommand{\bigsetminus}{\mathbin{\big\backslash}} +\newcommand{\Bigsetminus}{\mathbin{\Big\backslash}} +\newcommand{\dprime}{\ensuremath{\prime\prime}} +\newcommand{\tprime}{\ensuremath{\prime\prime\prime}} +\newcommand{\objective}{\ensuremath{\mathrm{obj}}} +\newcommand{\Dl}{\ensuremath{D_{\textup{lo}}}} +\newcommand{\Du}{\ensuremath{D_{\textup{up}}}} + +% floor of beta +\newcommand{\flbeta}{{\ThisStyle{% +\ensurestackMath{\stackengine{-0.5\LMpt}{\SavedStyle \beta}% +{\SavedStyle {\rule{3.7\LMpt}{0.3\LMpt}}} +{U}{c}{F}{F}{S}}\vphantom{\beta}}}} + +% operators +\DeclareMathOperator{\Var}{Var} +\DeclareMathOperator{\Cov}{Cov} +\DeclareMathOperator{\AIMSE}{AIMSE} +\DeclareMathOperator{\LOOCV}{LOOCV} +\DeclareMathOperator{\symconv}{symconv} +\DeclareMathOperator{\GCV}{GCV} +\DeclareMathOperator{\Unif}{Unif} +\DeclareMathOperator*{\logistic}{logistic} +\DeclareMathOperator{\Bias}{Bias} +\DeclareMathOperator{\Env}{Env} +\DeclareMathOperator*{\esssup}{ess\,sup} +\DeclareMathOperator{\Ber}{Ber} +\DeclareMathOperator{\KL}{KL} +\DeclareMathOperator{\Gam}{Gam} +\DeclareMathOperator{\Yule}{Yule} +\DeclareMathOperator{\rank}{rank} +\DeclareMathOperator{\Exp}{Exp} +\DeclareMathOperator{\Bin}{Bin} +\DeclareMathOperator{\Tr}{Tr} +\DeclareMathOperator{\Leb}{Leb} +\DeclareMathOperator*{\argmin}{arg\,min} +\DeclareMathOperator*{\minimize}{minimize:} +\DeclareMathOperator*{\subjectto}{subject\ to:} +\DeclareMathOperator{\ROT}{ROT} +\newcommand{\diff}[1]{\,\mathrm{d}#1} + +% theorem environments +\renewtheoremstyle{break}{% +\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % +##2}\hbox{\strut}}}]% +}{% +\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % +##2\ \normalfont (##3)}\hbox{\strut}}}]% +} +\theoremstyle{break} +\theorempreskip{7mm} +\newtheorem{theorem}{Theorem}[section] +\newtheorem{lemma}{Lemma}[section] +\newtheorem{assumption}{Assumption}[section] +\newtheorem{corollary}{Corollary}[section] +\newtheorem{proposition}{Proposition}[section] +\newtheorem{definition}{Definition}[section] +\newtheorem{remark}{Remark}[section] + +% proof environments +\let\proof\relax +\newtheoremstyle{proof}{% +\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % +}\hbox{\strut}}}]% +}{% +\item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % +\normalfont (##3)}\hbox{\strut}}}]% +} +\theoremstyle{proof} +\theorembodyfont{\upshape} +\theorempreskip{7mm} +\theoremsymbol{\ensuremath{\square}} +\newtheorem{proof}{Proof} 
+\AtBeginEnvironment{proof}{\setcounter{proofparagraphcounter}{0}}% + +% proof paragraphs +\titleformat{\paragraph}[hang]{\bfseries\upshape}{}{0pt}{}[] +\titlespacing*{\paragraph}{0pt}{6pt}{0pt} +\newcounter{proofparagraphcounter} +\newcommand{\proofparagraph}[1]{ +\refstepcounter{proofparagraphcounter}% +\paragraph{Part \theproofparagraphcounter : #1}}% + +% inline roman lists +\newlist{inlineroman}{enumerate*}{1} +\setlist[inlineroman]{afterlabel=~,label=(\roman*)} + +% algorithms +\DontPrintSemicolon% +\makeatletter% +\renewcommand{\SetKwInOut}[2]{% +\sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% +\expandafter\ifx\csname InOutSizeDefined\endcsname\relax% +\newcommand\InOutSizeDefined{}% +\setlength{\inoutsize}{\wd\algocf@inoutbox}% +\sbox\algocf@inoutbox{% +\parbox[t]{\inoutsize}% +{\KwSty{#2}\algocf@typo:\hfill}~% +}% +\setlength{\inoutindent}{\wd\algocf@inoutbox}% +\else% +\ifdim\wd\algocf@inoutbox>\inoutsize% +\setlength{\inoutsize}{\wd\algocf@inoutbox}% +\sbox\algocf@inoutbox{% +\parbox[t]{\inoutsize}% +{\KwSty{#2}\algocf@typo:\hfill}~% +}% +\setlength{\inoutindent}{\wd\algocf@inoutbox}% +\fi% +\fi% +\algocf@newcommand{#1}[1]{% +\ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}{% +\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% +{\inoutsize}{\KwSty{#2}% +\algocf@typo:\hfill}~##1\par% +}% +\algocf@linesnumbered% +}% +}% +\makeatother% +\SetKwInOut{Input}{Input}% +\SetKwInOut{Output}{Output}% +\setlength{\algomargin}{2em}% + +\author{William George Underwood} +\adviser{Matias Damian Cattaneo} +\title{Estimation and Inference in \\ Modern Nonparametric Statistics} + +\abstract{ + +% 350 words max + +Nonparametric methods are central to modern statistics, enabling data analysis +with minimal assumptions in a wide range of scenarios. While contemporary +procedures such as random forests and kernel methods are popular due to their +performance and flexibility, their statistical properties are often less well +understood. The availability of sound inferential techniques is vital in the +sciences, allowing researchers to quantify uncertainty in their models. We +develop methodology for robust and practical statistical estimation and +inference in some modern nonparametric settings involving complex estimators +and nontraditional data. + +We begin in the regression setting by studying the Mondrian random forest, a +variant in which the partitions are drawn from a Mondrian process. We present a +comprehensive analysis of the statistical properties of Mondrian random +forests, including a central limit theorem for the estimated regression +function and a characterization of the bias. We show how to conduct feasible +and valid nonparametric inference by constructing confidence intervals, and +further provide a debiasing procedure that enables minimax-optimal estimation +rates for smooth function classes in arbitrary dimension. + +Next, we turn our attention to nonparametric kernel density estimation with +dependent dyadic network data. We present results for minimax-optimal +estimation, including a novel lower bound for the dyadic uniform convergence +rate, and develop methodology for uniform inference via confidence bands and +counterfactual analysis. Our methods are based on strong approximations and are +designed to be adaptive to potential dyadic degeneracy. We give empirical +results with simulated and real-world economic trade data. + +Finally, we develop some new probabilistic results with applications to +nonparametric statistics. 
Coupling has become a popular approach for +distributional analysis in recent years, and Yurinskii's method stands out for +its wide applicability and explicit formulation. We present a generalization of +Yurinskii's coupling, treating approximate martingale data under weaker +conditions than previously imposed. We allow for Gaussian mixture coupling +distributions, and a third-order method permits faster rates in certain +situations. We showcase our results with applications to factor models and +martingale empirical processes, as well as nonparametric partitioning-based and +local polynomial regression procedures. +} +\acknowledgments{ + +I am extremely fortunate to have been surrounded by many truly wonderful people +over the course of my career, and without their support this dissertation would +not have been possible. While it is impossible for me to identify every one of +them individually, I would like to mention a few names in particular to +recognize those who have been especially important to me during the last few +years. + +Firstly, I would like to express my utmost gratitude to my Ph.D.\ adviser, +Matias Cattaneo. Working with Matias has been genuinely inspirational for me, +and I could not have asked for a more rewarding start to my journey as a +researcher. From the very beginning, he has guided me expertly through my +studies, providing hands-on assistance when required while also allowing me the +independence necessary to develop as an academic. I hope that, during the four +years we have worked together, I have acquired just a fraction of his formidable +mathematical intuition, keen attention to detail, boundless creativity, and +inimitable pedagogical skill. Alongside his role as my adviser, Matias has been +above all a friend, who has been in equal measure inspiring, +insightful, dedicated, understanding, and kind. + +Secondly, I would like to thank all of the faculty members at Princeton and +beyond who have acted as my collaborators and mentors, without whom none of my +work could have been realized. In particular, I express my gratitude to my +tireless Ph.D.\ committee members and letter writers Jianqing Fan and Jason +Klusowski, my coauthors Yingjie Feng and Ricardo Masini, my dissertation reader +Boris Hanin, my teachers +Amir Ali Ahmadi, Ramon van Handel, Mikl{\'o}s R{\'a}cz, and Mykhaylo Shkolnikov, +my colleagues Sanjeev Kulkarni and Roc{\'i}o Titiunik, +and my former supervisor Mihai Cucuringu. +I am also thankful for the staff members at Princeton who have been +perpetually helpful, and I would like to identify Kim +Lupinacci in particular; her assistance in all things administrative has been +invaluable. + +I am grateful to my fellow graduate students in the ORFE department for their +technical expertise and generosity with their time, and for making Sherrerd +Hall such a vibrant and exciting space, especially +Jose Avilez, +Pier Beneventano, +Ben Budway, +Rajita Chandak, +Abraar Chaudhry, +Stefan Clarke, +Giulia Crippa, +G{\"o}k{\c c}e Dayan{\i}kl{\i}, +Nicolas Garcia, +Felix Hoefer, +Erica Lai, +Jackie Lok, +Maya Mutic, +Dan Rigobon, +Till Saenger, +Rajiv Sambharya, +Boris Shigida, +Igor Silin, +Giang Truong, +and Rae Yu. +Our +regular social events made a contribution to my well-being which is difficult +to overstate. My thanks extend also to the students I taught, as +well as to my group of senior thesis undergraduates, for their commitment, +patience, and responsiveness. 
+ +More broadly, I would like to thank all of my friends, near and far, +for their unfailing support and reliability, and for +helping to create so many of my treasured memories. In particular, +Ole Agersnap, +James Ashford, +Christian Baehr, +Chris Bambic, +Kevin Beeson, +James Broadhead, +Alex Cox, +Reece Edmends, +Robin Franklin, +Greg Henderson, +Bonnie Ko, +Grace Matthews, +Dan Mead, +Ben Musachio, +Jacob Neis, +Monika Papayova, +Will Pedrick, +Oliver Philcox, +Nandita Rao, +Alex Rice, +Edward Rowe, +David Snyder, +Titi Sodimu, +Nikitas Tampakis, +and Anita Zhang. +Thank you to the Princeton Chapel Choir for being such a wonderful +community of musicians and a source of close friends, +and to our directors, Nicole Aldrich and Penna Rose, and organist Eric Plutz. + +Lastly, yet most importantly, I want to thank my family for their unwavering +support throughout my studies. My visits back home have been a source of joy +throughout my long and often challenging Ph.D., and I cherish every moment I +have spent with my parents, sister, grandparents, and extended family. +} + +\begin{document} + + +\chapter{Introduction} + +% nonparametric estimation is common +Nonparametric estimation procedures are at the heart of many contemporary +theoretical and methodological topics within the fields of statistics, data +science, and machine learning. Where classical parametric techniques impose +specific distributional and structural assumptions when modeling statistical +problems, nonparametric methods instead take a more flexible approach, +typically positing only high-level restrictions such as moment conditions, +independence criteria, and smoothness assumptions. Examples of such procedures +abound in modern data science and machine learning, encompassing histograms, +kernel estimators, smoothing splines, decision trees, nearest neighbor methods, +random forests, neural networks, and many more. + +% nonparametric estimation is good +The benefits of the nonparametric framework are clear: statistical procedures +can be formulated in cases where the stringent assumptions of parametric models +are untestable, demonstrably violated, or simply unreasonable. +As a consequence, +the resulting methods often inherit desirable robustness properties against +various forms of misspecification or misuse. The class of problems that can be +formulated is correspondingly larger: arbitrary distributions and +relationships can be characterized and estimated in a principled manner. + +% nonparametric estimation is hard +Nonetheless, these attractive properties do come at a price. In particular, as +its name suggests, the nonparametric approach forgoes the ability to reduce +a complex statistical problem to that of estimating a fixed, finite number of +parameters. Rather, nonparametric procedures typically involve making inferences +about a growing number of parameters simultaneously, as witnessed in +high-dimensional regimes, or even directly handling infinite-dimensional +objects such as entire regression or density functions. As a consequence, +nonparametric estimators are usually less efficient than their +correctly specified parametric counterparts, when they are available; rates of +convergence tend to be slower, and confidence sets more conservative. 
Another +challenge is that theoretical mathematical analyses of nonparametric estimators +are often significantly more demanding than those required for low-dimensional +parametric settings, necessitating tools from contemporary developments in +high-dimensional concentration phenomena, coupling and strong approximation +theory, empirical processes, mathematical optimization, and stochastic +calculus. + +% nonparametric inference +In addition to providing accurate point estimates of unknown (possibly +high-dimensional or infinite-dimensional) quantities of interest, modern +nonparametric procedures are also expected to come equipped with methodologies +for conducting statistical inference. The availability of such inferential +techniques is paramount, with contemporary nonparametric methods forming a +ubiquitous component of modern data science tool kits. Valid uncertainty +quantification is essential for hypothesis testing, error bar construction, +assessing statistical significance, and performing power analyses. Inference is +a central concept in classical statistics, and despite the rapid +recent development of theory for modern nonparametric estimators, their +applicability to statistical inference is in certain cases rather less well +studied; theoretically sound and practically implementable inference procedures +are sometimes absent in the literature. + +% complex data +In any statistical modeling problem, the selection and application of an +estimator must naturally be tailored to the available data. Today, much of the +data produced and analyzed does not necessarily fit neatly into the classical +framework of independent and identically distributed samples, and instead might +consist of time series, stochastic processes, networks, +or high-dimensional or functional data, to name just a few. +Therefore, it is important to understand how nonparametric methods might be +adapted to correctly handle these data types, maintaining fast estimation rates +and valid techniques for statistical inference. The technical challenges +associated with such an endeavor are non-trivial; many standard techniques are +ineffective in the presence of dependent or infinite-dimensional data, for +example. As such, the development of new mathematical results in probability +theory plays an important role in the comprehensive treatment of nonparametric +statistics with complex data. + +\section*{Overview of the dissertation} + +% what we do +This dissertation presents a selection of topics relating to nonparametric +estimation and inference, and the associated technical mathematical tools. + +% mondrian +Chapter~\ref{ch:mondrian}, titled ``Inference with Mondrian Random Forests,'' +is based on the work of \citet{cattaneo2023inference}. +% what are random forests +Random forests are popular ensembling-based methods for classification and +regression, which are well known for their good performance, flexibility, +robustness, and efficiency. The majority of random forest models share the +following common framework for producing estimates of a classification or +regression function using covariates and a response variable. Firstly, the +covariate space is partitioned in some algorithmic manner, possibly using a +source of external randomness. Secondly, a local estimator of the +classification or regression function is fitted to the responses in each cell +separately, yielding a tree estimator. 
Finally, this process is repeated with +many different partitions, and the resulting tree estimators are averaged to +produce a random forest. + +% why are there variants +Many different variants of random forests have been proposed in recent years, +typically with the aim of improving their statistical or computational +properties, or simplifying their construction in order to permit a more +detailed theoretical analysis. +% mondrian random forests +One interesting such example is that of the Mondrian random forest, in which +the underlying partitions (or trees) are constructed independently of the data. +Naturally, this restriction rules out many classical random forest models, which +exhibit a complex and data-dependent partitioning scheme. Instead, trees are +sampled from a canonical stochastic process known as the Mondrian process, +which endows the resulting tree and forest estimators with various agreeable +features. + +% what we do +We study the estimation and inference properties of Mondrian +random forests in the nonparametric regression setting. In particular, we +establish a novel central limit theorem for the estimates made by a Mondrian +random forest which, when combined with a characterization of the bias and a +consistent variance estimator, allows one to perform asymptotically valid +statistical inference, such as constructing confidence intervals, on the +unknown regression function. We also provide a debiasing procedure for Mondrian +random forests, which allows them to achieve minimax-optimal estimation rates +with H{\"o}lder smooth regression functions, for any smoothness parameter and +in arbitrary dimension. + +% kernel +Chapter~\ref{ch:kernel}, titled ``Dyadic Kernel Density Estimators,'' is based +on the work of \citet{cattaneo2024uniform}. Network data plays an important role +in statistics, econometrics, and many other data science disciplines, providing +a natural framework for modeling relationships between units, be they people, +financial institutions, proteins, or economic entities. Of prominent interest +is the task of performing statistical estimation and inference with data +sampled from the edges of such networks, known as dyadic data. The archetypal +lack of independence between edges in a network renders many classical +statistical tools unsuited for direct application. As such, researchers must +appeal to techniques tailored to dyadic data in order to accurately capture the +complex structure present in the network. + +% broad scope +We focus on nonparametric estimation and inference with dyadic +data, and in particular we seek methods that are robust in the sense that our +results should hold uniformly across the support of the data. Such uniformity +guarantees allow for statistical inference in a broader range of settings, +including specification testing and distributional counterfactual analysis. We +specifically consider the problem of uniformly estimating a dyadic +density function, focusing on kernel estimators taking the form of dyadic +empirical processes. + +% main contributions +Our main contributions include the minimax-optimal uniform convergence rate of +the dyadic kernel density estimator, along with strong approximation results +for the associated standardized and Studentized $t$-processes. A consistent +variance estimator enables the construction of feasible uniform +confidence bands for the unknown density function. 
We showcase the broad +applicability of our results by developing novel counterfactual density +estimation and inference methodology for dyadic data, which can be used for +causal inference and program evaluation. +% why it is difficult +A crucial feature of dyadic distributions is that they may be ``degenerate'' at +certain points in the support of the data, a property that makes our analysis +somewhat delicate. Nonetheless, our methods for uniform inference remain robust +to the potential presence of such points. +% applications +For implementation purposes, we discuss inference procedures based on positive +semi-definite covariance estimators, mean squared error optimal bandwidth +selectors, and robust bias correction. We illustrate the empirical +performance of our methods in simulations and with +real-world trade data, for which we make comparisons between observed and +counterfactual trade distributions in different years. Our technical results +on strong approximations and maximal inequalities are of potential +independent interest. + +% yurinskii +Finally, Chapter~\ref{ch:yurinskii}, titled ``Yurinskii's Coupling for +Martingales,'' is based on the work of \citet{cattaneo2022yurinskii}. +Yurinskii's coupling is a popular theoretical tool for non-asymptotic +distributional analysis in mathematical statistics and applied probability. +Coupling theory, also known as strong approximation, provides an alternative +framework to the more classical weak convergence approach to statistical +analysis. Rather than merely approximating the distribution of a random +variable, strong approximation techniques construct a sequence of random +variables which are close almost surely or in probability, often with +finite-sample guarantees. + +% what is it used for +Coupling allows distributional analysis in settings where weak convergence +fails, including many applications to nonparametric or high-dimensional +statistics; it is a key technical component in the main strong approximation +results of our Chapter~\ref{ch:kernel}. The Yurinskii method specifically +offers a Gaussian coupling with an explicit error bound under easily verified +conditions; originally stated in $\ell^2$-norm for sums of independent random +vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p +\leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some +strong conditions. + +% what we do +We present as our main result a Yurinskii coupling for approximate martingales +in $\ell^p$-norm, under substantially weaker conditions than previously +imposed. Our formulation allows the coupling variable to follow a +general Gaussian mixture distribution, and we provide a novel third-order +coupling method which gives tighter approximations in certain situations. We +specialize our main result to mixingales, martingales, and independent data, +and derive uniform Gaussian mixture strong approximations for martingale +empirical processes. Applications to nonparametric partitioning-based and local +polynomial regression procedures are provided. + +% appendices +Supplementary materials for Chapters~\ref{ch:mondrian}, \ref{ch:kernel}, and +\ref{ch:yurinskii} are provided in Appendices~\ref{app:mondrian}, +\ref{app:kernel}, and \ref{app:yurinskii} respectively. These contain detailed +proofs of the main results, additional technical contributions, and further +discussion. 
+ +\chapter[Inference with Mondrian Random Forests]% +{Inference with \\ Mondrian Random Forests} +\label{ch:mondrian} + +% abstract +Random forests are popular methods for classification and regression, and many +different variants have been proposed in recent years. One interesting example +is the Mondrian random forest, in which the underlying trees are constructed +according to a Mondrian process. In this chapter we give a central limit theorem +for the estimates made by a Mondrian random forest in the regression setting. +When combined with a bias characterization and a consistent variance estimator, +this allows one to perform asymptotically valid statistical inference, such as +constructing confidence intervals, on the unknown regression function. We also +provide a debiasing procedure for Mondrian random forests which allows them to +achieve minimax-optimal estimation rates with $\beta$-H{\"o}lder regression +functions, for all $\beta$ and in arbitrary dimension, assuming appropriate +parameter tuning. + +\section{Introduction} + +Random forests, first introduced by \citet{breiman2001random}, are a workhorse +in modern machine learning for classification and regression tasks. +Their desirable traits include computational efficiency (via parallelization +and greedy heuristics) in big data settings, simplicity of configuration and +amenability to tuning parameter selection, ability to adapt to latent structure +in high-dimensional data sets, and flexibility in handling mixed data types. +Random forests have achieved great empirical successes in many fields of study, +including healthcare, finance, online commerce, text analysis, bioinformatics, +image classification, and ecology. + +Since Breiman introduced random forests over twenty years ago, the study of +their statistical properties remains an active area of research: see +\citet{scornet2015consistency}, \citet{chi2022asymptotic}, +\citet{klusowski2024large}, and references therein, for a sample of recent +developments. Many fundamental questions about Breiman's random forests remain +unanswered, owing in part to the subtle ingredients present in the estimation +procedure which make standard analytical tools ineffective. These technical +difficulties stem from the way the constituent trees greedily partition the +covariate space, utilizing both the covariate and response data. This creates +complicated dependencies on the data which are often exceedingly hard to +untangle without overly stringent assumptions, thereby hampering theoretical +progress. + +To address the aforementioned technical challenges while retaining the +phenomenology of Breiman's random forests, a variety of stylized versions of +random forest procedures have been proposed and studied in the literature. +These include centered random forests +\citep{biau2012analysis,arnould2023interpolation} and median random forests +\citep{duroux2018impact,arnould2023interpolation}. Each tree in a centered +random forest is constructed by first choosing a covariate uniformly at random +and then splitting the cell at the midpoint along the direction of the chosen +covariate. Median random forests operate in a similar way, but involve the +covariate data by splitting at the empirical median along the direction of the +randomly chosen covariate. Known as purely random forests, these procedures +simplify Breiman's original---albeit more data-adaptive---version by growing +trees that partition the covariate space in a way that is statistically +independent of the response data. 
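+
+For illustration only, the two splitting rules just described can be written
+as short functions returning a split dimension and a split location. The
+following Python sketch is ours and is not part of any of the cited
+constructions; a cell is represented as a list of coordinate intervals and
+\texttt{X\_in\_cell} denotes the covariate data lying in the cell.
+%
+\begin{verbatim}
+import random
+import statistics
+
+def centered_split(cell):
+    # choose a covariate uniformly at random, split at the cell midpoint
+    j = random.randrange(len(cell))
+    lo, hi = cell[j]
+    return j, (lo + hi) / 2
+
+def median_split(cell, X_in_cell):
+    # choose a covariate uniformly at random, split at the empirical median
+    j = random.randrange(len(cell))
+    return j, statistics.median(x[j] for x in X_in_cell)
+\end{verbatim}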
+ +Yet another variant of random forests, Mondrian random forests +\citep{lakshminarayanan2014mondrian}, have received significant attention in +the statistics and machine learning communities in recent years +\citep{ma2020isolation, mourtada2020minimax, scillitoe2021uncertainty, +mourtada2021amf, vicuna2021reducing, gao2022towards, oreilly2022stochastic}. +Like +other purely random forest variants, Mondrian random forests offer a simplified +modification of Breiman's original proposal in which the partition is generated +independently of the data and according to a canonical stochastic process known +as the Mondrian process \citep{roy2008mondrian}. The Mondrian process takes a +single parameter $\lambda > 0$ known as the ``lifetime'' and enjoys various +mathematical properties. These probabilistic +features allow Mondrian random forests to be +fitted in an online manner as well as being subject to a rigorous statistical +analysis, while also retaining some of the appealing features of other +more traditional random forest methods. + +This chapter studies the statistical properties of Mondrian random forests. We +focus on this purely random forest variant not only because of its importance +in the development of random forest theory in general, but also because the +Mondrian process is, to date, the only known recursive tree mechanism involving +randomization, pure or data-dependent, for which the resulting random forest is +minimax-optimal for point estimation over a class of smooth regression +functions in arbitrary dimension \citep{mourtada2020minimax}. In fact, when the +covariate dimension exceeds one, the aforementioned centered and median random +forests are both minimax-\emph{suboptimal}, due to their large biases, over the +class of Lipschitz smooth regression functions \citep{klusowski2021sharp}. It +is therefore natural to focus our study of inference for random forests on +versions that at the very least exhibit competitive bias and variance, as this +will have important implications for the trade-off between precision and +confidence. + +Despite their recent popularity, relatively little is known about the formal +statistical properties of Mondrian random forests. Focusing on nonparametric +regression, \citet{mourtada2020minimax} recently showed that Mondrian forests +containing just a single tree (called a Mondrian tree) can be minimax-optimal +in integrated mean squared error whenever the regression function is +$\beta$-H{\"o}lder continuous for some $\beta \in (0, 1]$. The authors also +showed that, when appropriately tuned, large Mondrian random forests can be +similarly minimax-optimal for $\beta \in (0, 2]$, while the constituent trees +cannot. See also \citet{oreilly2022stochastic} for analogous results for more +general +Mondrian tree and forest constructions. These results formally demonstrate the +value of ensembling with random forests from a point estimation perspective. No +results are currently available in the literature for statistical inference +using Mondrian random forests. + +This chapter contributes to the literature on the foundational statistical +properties of Mondrian random forest regression estimation with two main +results. Firstly, we give a central limit theorem for the classical Mondrian +random forest point estimator, and propose valid large-sample inference +procedures employing a consistent standard error estimator. 
We establish this +result by deploying a martingale central limit theorem +\citep[Theorem~3.2]{hall1980martingale} because we need to handle delicate +probabilistic features of the Mondrian random forest estimator. In particular, +we deal with the existence of Mondrian cells which are ``too small'' and lead +to a reduced effective (local) sample size for some trees in the forest. Such +pathological cells are in fact typical in Mondrian random forests and +complicate the probability limits of certain sample averages; in fact, small +Mondrian random forests (or indeed single Mondrian trees) remain random even +in the limit due to the lack of ensembling. The presence of small cells +renders inapplicable prior distributional approximation results for +partitioning-based estimators in the literature +\citep{huang2003local,cattaneo2020large}, since the commonly required +quasi-uniformity assumption on the underlying partitioning scheme is violated +by cells generated using the Mondrian process. We circumvent this +technical challenge by establishing new theoretical results for Mondrian +partitions and their associated Mondrian trees and forests, which may be of +independent interest. + +The second main contribution of the chapter is to propose a debiasing approach +for the Mondrian random forest point estimator. We accomplish this by first +precisely characterizing the probability limit of the large sample conditional +bias, and then applying a debiasing procedure based on the generalized +jackknife \citep{schucany1977improvement}. We thus exhibit a Mondrian random +forest variant which is minimax-optimal in pointwise mean squared error when +the regression function is $\beta$-H{\"o}lder for any $\beta > 0$. Our method +works by generating an ensemble of Mondrian random forests carefully chosen to +have smaller misspecification bias when extra smoothness is available, +resulting in minimax optimality even for $\beta > 2$. This result complements +\citet{mourtada2020minimax} by demonstrating the existence of a class of +Mondrian random forests that can efficiently exploit the additional smoothness +of the unknown regression function for minimax-optimal point estimation. Our +proposed debiasing procedure is also useful when conducting statistical +inference because it provides a principled method for ensuring that the bias is +negligible relative to the standard deviation of the estimator. More +specifically, we use our debiasing approach to construct valid inference +procedures based on robust bias correction +\citep{calonico2018effect,calonico2022coverage}. + +This chapter is structured as follows. In Section~\ref{sec:mondrian_setup} we +introduce the Mondrian process and give our assumptions on the data generating +process, using a H{\"o}lder smoothness condition on the regression function to +control the bias of various estimators. We define the Mondrian random forest +estimator and present our assumptions on its lifetime parameter and the number +of trees. We give our notation for the following sections in this chapter. + +Section~\ref{sec:mondrian_inference} presents our first set of main results, +beginning with a central limit theorem for the centered Mondrian random forest +estimator (Theorem~\ref{thm:mondrian_clt}), in which we characterize the +limiting +variance. Theorem~\ref{thm:mondrian_bias} complements this result by precisely +calculating the limiting bias of the estimator, with the aim of subsequently +applying a debiasing procedure. 
To enable valid feasible statistical inference, +we provide a consistent variance estimator in +Theorem~\ref{thm:mondrian_variance_estimation} and briefly discuss implications +for +lifetime parameter selection. + +In Section~\ref{sec:mondrian_overview_proofs} we provide a brief overview of +the proofs +of these first main results. We focus on the technical innovations and general +strategic approach, giving some insight into the challenges involved, and refer +the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs. + +In Section~\ref{sec:mondrian_debiased} we define debiased Mondrian random +forests, a +collection of estimators based on linear combinations of Mondrian random +forests with varying lifetime parameters. These parameters are carefully chosen +to annihilate leading terms in our bias characterization, yielding an estimator +with provably superior bias properties +(Theorem~\ref{thm:mondrian_bias_debiased}). In +Theorem~\ref{thm:mondrian_clt_debiased} +we verify that a central limit theorem continues to hold for the debiased +Mondrian random forest. We again state the limiting variance, discuss the +implications for the lifetime parameter, and provide a consistent variance +estimator (Theorem~\ref{thm:mondrian_variance_estimation_debiased}) for +constructing +confidence intervals (Theorem~\ref{thm:mondrian_confidence_debiased}). As a +final +corollary of the improved bias properties, we demonstrate in +Theorem~\ref{thm:mondrian_minimax} that the debiased Mondrian random forest +estimator is minimax-optimal in pointwise mean squared error for all +$\beta > 0$, provided that $\beta$ is known a priori. + +Section~\ref{sec:mondrian_parameter_selection} discusses tuning parameter +selection, +beginning with a data-driven approach to selecting the crucial lifetime +parameter using polynomial estimation, alongside other practical suggestions +including generalized cross-validation. +We also give advice on choosing the number of trees, and other parameters +associated with the debiasing procedure. + +In Section~\ref{sec:mondrian_weather} we present an illustrative example +application of our proposed methodology for estimation and inference in the +setting of weather forecasting in Australia. We demonstrate the use of +our debiased Mondrian random forest estimator and our +generalized cross-validation procedure for lifetime parameter selection, +as well as the construction of point estimates and confidence intervals. + +Concluding remarks are given in Section~\ref{sec:mondrian_conclusion}, while +Appendix~\ref{app:mondrian} contains all the mathematical proofs of our +theoretical contributions, along with some other technical +probabilistic results on the Mondrian process which may be of interest. + +\subsection{Notation} + +We write $\|\cdot\|_2$ for the usual Euclidean $\ell^2$-norm on $\R^d$. The +natural numbers are $\N = \{0, 1, 2, \ldots \}$. We use $a \wedge b$ for the +minimum and $a \vee b$ for the maximum of two real numbers. For a set $A$, we +use $A^{\comp}$ for the complement whenever the background space is clear from +context. We use $C$ to denote a positive constant whose value may change from +line to line. For non-negative sequences $a_n$ and $b_n$, write +$a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded +for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If +$a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. 
For random +non-negative sequences $A_n$ and $B_n$, similarly write $A_n \lesssim_\P B_n$ +or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability, +and $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. Convergence of +random variables $X_n$ in distribution to a law $\P$ is denoted by +$X_n \rightsquigarrow \P$. + +\section{Setup} +\label{sec:mondrian_setup} + +When using a Mondrian random forest, there are two sources of randomness. The +first is of course the data, and here we consider the nonparametric regression +setting with $d$-dimensional covariates. The second source is a collection of +independent trees drawn from a Mondrian process, which we define in the +subsequent section, using a specified lifetime parameter. + +\subsection{The Mondrian process} +\label{sec:mondrian_process} + +The Mondrian process was introduced by \citet{roy2008mondrian} and offers a +canonical method for generating random rectangular partitions, which can be +used as the trees for a random forest +\citep{lakshminarayanan2014mondrian,lakshminarayanan2016mondrian}. For +the reader's convenience, we give a brief description of this process here; see +\citet[Section~3]{mourtada2020minimax} for a more complete definition. + +For a fixed dimension $d$ and lifetime parameter $\lambda > 0$, the Mondrian +process is a stochastic process taking values in the set of finite rectangular +partitions of $[0,1]^d$. For a rectangle +$D = \prod_{j=1}^d [a_j, b_j] \subseteq [0,1]^d$, +we denote the side aligned with dimension $j$ by $D_j = [a_j, b_j]$, write +$D_j^- = a_j$ and $D_j^+ = b_j$ for its left and right endpoints respectively, +and use $|D_j| = D_j^+ - D_j^-$ for its length. The volume of $D$ is +$|D| = \prod_{j=1}^{d} |D_j|$ and its linear dimension (or half-perimeter) is +$|D|_1 = \sum_{j=1}^{d} |D_j|$. + +To sample a partition $T$ from the Mondrian process +$\cM \big( [0,1]^d, \lambda \big)$ we start at time $t=0$ with the trivial +partition of $[0,1]^d$ which has no splits. We then repeatedly apply the +following procedure to each cell $D$ in the partition. Let $t_D$ be the time at +which the cell was formed, and sample $E_D \sim \Exp \left( |D|_1 \right)$. If +$t_D + E_D \leq \lambda$, then we split $D$. This is done by first selecting a +split dimension $J$ with $\P(J=j) = |D_j| / |D|_1$, and then sampling a split +location $S_J \sim \Unif\big[D_J^-, D_J^+\big]$. The cell $D$ splits into the +two new cells $\{x \in D : x_J \leq S_J\}$ and $\{x \in D : x_J > S_J\}$, each +with formation time $t_D + E_D$. The final outcome is the partition $T$ +consisting of the cells $D$ which were not split because $t_D + E_D > \lambda$. +The cell in $T$ containing a point $x \in [0,1]^d$ is written $T(x)$. +Figure~\ref{fig:mondrian_process} shows typical realizations of +$T \sim \cM\big( [0,1]^d, \lambda \big)$ for $d=2$ and with different lifetime +parameters $\lambda$. 
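+
+The recursive construction above is straightforward to simulate. The short
+Python sketch below is purely illustrative and not part of the formal
+development; the function name \texttt{sample\_mondrian} and the
+representation of a cell as a list of coordinate intervals are our own
+choices.
+%
+\begin{verbatim}
+import random
+
+def sample_mondrian(cell, birth, lam):
+    """Return the leaf cells of a Mondrian partition of `cell`,
+    where `cell` is a list of (lower, upper) pairs, one per dimension."""
+    lengths = [hi - lo for lo, hi in cell]
+    # the cell is split after an Exp(|D|_1) waiting time
+    split_time = birth + random.expovariate(sum(lengths))
+    if split_time > lam:
+        return [cell]  # lifetime exceeded: the cell is a leaf of the tree
+    # split dimension chosen with probability proportional to side length
+    j = random.choices(range(len(cell)), weights=lengths)[0]
+    s = random.uniform(*cell[j])
+    left, right = list(cell), list(cell)
+    left[j], right[j] = (cell[j][0], s), (s, cell[j][1])
+    return (sample_mondrian(left, split_time, lam)
+            + sample_mondrian(right, split_time, lam))
+
+# for example, a partition of the unit square with lifetime 3
+partition = sample_mondrian([(0.0, 1.0), (0.0, 1.0)], 0.0, lam=3.0)
+\end{verbatim}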
+% +\begin{figure}[t] +\centering +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/plot_mondrian_process_1.pdf} +\caption{$\lambda = 3$} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/plot_mondrian_process_2.pdf} +\caption{$\lambda = 10$} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/plot_mondrian_process_3.pdf} +\caption{$\lambda = 30$} +\end{subfigure} +% +\caption[The Mondrian process]{ +The Mondrian process $T \sim \cM \big( [0,1]^d, \lambda \big)$ with +$d=2$ and lifetime parameters $\lambda$.} +\label{fig:mondrian_process} +\end{figure} + +\subsection{Data generation} + +Throughout this chapter, we assume that the data satisfies +Assumption~\ref{ass:mondrian_data}. We begin with a definition of H{\"o}lder +continuity which will be used for controlling the bias of various estimators. + +\begin{definition}[H{\"o}lder continuity]% + +Take $\beta > 0$ and define $\flbeta$ to be the largest integer which is +strictly less than $\beta$. We say a function $g: [0,1]^d \to \R$ is +$\beta$-H{\"o}lder continuous and write $g \in \cH^\beta$ if $g$ is $\flbeta$ +times differentiable and +$\max_{|\nu| = \flbeta} +\left| \partial^\nu g(x) - \partial^{\nu} g(x') \right| +\leq C \|x-x'\|_2^{\beta - \flbeta}$ +for some constant $C > 0$ and all $x, x' \in [0,1]^d$. Here, $\nu \in \N^d$ +is a multi-index with $|\nu| = \sum_{j=1}^d \nu_j$ and +$\partial^{\nu} g(x) = \partial^{|\nu|} g(x) \big/ +\prod_{j=1}^d \partial x_j^{\nu_j}$. We say $g$ is Lipschitz if $g \in \cH^1$. + +\end{definition} + +\begin{assumption}[Data generation]% +\label{ass:mondrian_data} + +Fix $d \geq 1$ and let $(X_i, Y_i)$ be i.i.d.\ samples from a distribution on +$\R^d \times \R$, writing $\bX = (X_1, \ldots, X_n)$ and +$\bY = (Y_1, \ldots, Y_n)$. Suppose $X_i$ has a Lebesgue density function +$f(x)$ on $[0,1]^d$ which is bounded away from zero and satisfies +$f \in \cH^\beta$ for some $\beta \geq 1$. Suppose $\E[Y_i^2 \mid X_i]$ is +bounded, let $\mu(X_i) = \E[Y_i \mid X_i]$, and assume $\mu \in \cH^\beta$. +Write $\varepsilon_i = Y_i - \mu(X_i)$ and assume +$\sigma^2(X_i) = \E[\varepsilon_i^2 \mid X_i]$ +is Lipschitz and bounded away from zero. + +\end{assumption} + +Some comments are in order surrounding Assumption~\ref{ass:mondrian_data}. The +requirement that the covariate density $f(x)$ be strictly positive on all of +$[0,1]^d$ may seem strong, particularly when $d$ is moderately large. However, +since our theory is presented pointwise in $x$, it is sufficient for this to +hold only on some neighborhood of $x$. To see this, note that continuity +implies the density is positive on some hypercube containing $x$. Upon +rescaling the covariates, we can map this hypercube onto $[0,1]^d$. The same +argument of course holds for the H{\"o}lder smoothness assumptions and the +upper and lower bounds on the conditional variance function. + +\subsection{Mondrian random forests} +\label{sec:mondrian_forests} + +We define the basic Mondrian random forest estimator +\eqref{eq:mondrian_estimator} as in \citet{lakshminarayanan2014mondrian} and +\citet{mourtada2020minimax}, and will later extend it to a debiased version in +Section~\ref{sec:mondrian_debiased}. 
For a lifetime parameter $\lambda > 0$ and +forest +size $B \geq 1$, let $\bT = (T_1, \ldots, T_B)$ be a Mondrian forest where +$T_b \sim \cM\big([0,1]^d, \lambda\big)$ are i.i.d.\ Mondrian trees +which are independent of the data. For $x \in [0,1]^d$, write +$N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ for the number of +samples in $T_b(x)$, with $\I$ denoting an indicator function. Then the +Mondrian random forest estimator of $\mu(x)$ is +% +\begin{equation} +\label{eq:mondrian_estimator} +\hat\mu(x) = \frac{1}{B} \sum_{b=1}^B +\frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. +\end{equation} +% +If there are no samples $X_i$ in $T_b(x)$ then $N_b(x) = 0$, so we define +$0/0 = 0$ (see Section~\ref{sec:mondrian_app_proofs} for details). To ensure the +bias and variance of the Mondrian random forest estimator converge to zero (see +Section~\ref{sec:mondrian_inference}), and to avoid boundary issues, we impose +some basic conditions on $x$, $\lambda$, and $B$ in +Assumption~\ref{ass:mondrian_estimator}. + +\begin{assumption}[Mondrian random forest estimator]% +\label{ass:mondrian_estimator} +% +Suppose $x \in (0,1)^d$ is an interior point of the support of $X_i$, +$\frac{\lambda^d}{n} \to 0$, +$\log \lambda \asymp \log n$, +and $B \asymp n^{\xi}$ for some $\xi \in (0, 1)$, +which may depend on the dimension $d$ and smoothness $\beta$. +% +\end{assumption} + +Assumption~\ref{ass:mondrian_estimator} implies that the size of the forest $B$ +grows +with $n$. For the purpose of mitigating the computational burden, we suggest +the sub-linear polynomial growth $B \asymp n^{\xi}$, satisfying the conditions +imposed in our main results. Large forests usually do not present computational +challenges in practice as the ensemble estimator is easily parallelizable over +the trees. We emphasize places where this ``large forest'' condition is +important to our theory as they arise throughout the chapter. + +\section{Inference with Mondrian random forests}% +\label{sec:mondrian_inference} + +We begin with a bias--variance decomposition for the Mondrian random +forest estimator: +% +\begin{align} +\nonumber +\hat\mu(x) - \mu(x) +&= +\Big( \hat\mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big]\Big) ++ \Big( \E \big[ \hat \mu(x) \mid \bX, \bT \big] - \mu(x)\Big) \\ +&= +\nonumber +\frac{1}{B} \sum_{b=1}^B +\frac{\sum_{i=1}^n \varepsilon_i \, \I\big\{ X_i \in T_b(x) \big\}} +{N_b(x)} \\ +\label{eq:mondrian_bias_variance} +&\quad+ +\frac{1}{B} \sum_{b=1}^B +\frac{\sum_{i=1}^n \big(\mu(X_i) - \mu(x)\big) \, +\I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. +\end{align} +% +Our approach to inference is summarized as follows. Firstly, we provide a +central limit theorem (weak convergence to a Gaussian) for the first +``variance'' term in \eqref{eq:mondrian_bias_variance}. Secondly, we precisely +compute +the probability limit of the second ``bias'' term. By ensuring that the +standard deviation dominates the bias, a corresponding +central limit theorem holds for the Mondrian random forest. With an appropriate +estimator for the limiting variance, we establish procedures for valid and +feasible statistical inference on the unknown regression function $\mu(x)$. + +We begin with the aforementioned central limit theorem, which forms the core of +our methodology for performing statistical inference. Before stating our main +result, we highlight some of the challenges involved. 
At first glance, the
+summands in the first term in \eqref{eq:mondrian_bias_variance} seem to be
+independent
+over $1 \leq i \leq n$, conditional on the forest $\bT$, depending only on
+$X_i$ and $\varepsilon_i$. However, the $N_b(x)$ appearing in the denominator
+depends on all $X_i$ simultaneously, violating this independence assumption and
+rendering classical central limit theorems inapplicable. A natural preliminary
+attempt to resolve this issue is to observe that
+%
+\begin{equation*}
+N_b(x)= \sum_{i=1}^{n} \I\big\{X_i \in T_b(x)\big\}
+\approx n \, \P \big( X_i \in T_b(x) \mid T_b \big)
+\approx n f(x) |T_b(x)|
+\end{equation*}
+%
+with high probability. One could attempt to use this by approximating the
+estimator with an average of i.i.d.\ random variables, or by employing a
+central limit theorem conditional on $\bX$ and $\bT$. However, such an approach
+fails because $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$; the possible
+existence of small cells causes the law of the inverse cell volume to have
+heavy tails. For similar reasons, attempts to directly establish a central
+limit theorem based on $2 + \delta$ moments, such as the Lyapunov central limit
+theorem, are ineffective.
+
+We circumvent these problems by directly analyzing
+$\frac{\I\{N_b(x) \geq 1\}}{N_b(x)}$. We establish concentration properties for
+this non-linear function of $X_i$ via the Efron--Stein inequality
+\citep[Section 3.1]{boucheron2013concentration} along with a sequence of
+somewhat delicate preliminary lemmas regarding inverse moments of truncated
+(conditional) binomial random variables. In particular, we show that
+$\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)} \right]
+\lesssim \frac{\lambda^d}{n}$ and
+$\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)^2} \right]
+\lesssim \frac{\lambda^{2d} \log n}{n^2}$.
+Asymptotic normality is then established using a central limit theorem for
+martingale difference sequences \citep[Theorem~3.2]{hall1980martingale} with
+respect to an appropriate filtration.
+Section~\ref{sec:mondrian_overview_proofs} gives
+an overview of our proof strategy in which we further discuss the underlying
+challenges, while Section~\ref{sec:mondrian_app_proofs} gives all the technical
+details.
+
+\subsection{Central limit theorem}
+\label{sec:mondrian_clt}
+
+Theorem~\ref{thm:mondrian_clt} gives our first main result.
+
+\begin{theorem}[Central limit theorem for the centered
+Mondrian random forest estimator]%
+\label{thm:mondrian_clt}
+%
+Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}
+hold, and further assume that
+$\E[Y_i^4 \mid X_i ]$ is bounded almost surely
+and $\frac{\lambda^d \log n}{n} \to 0$. Then
+%
+\begin{align*}
+\sqrt{\frac{n}{\lambda^d}}
+\Big( \hat \mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big] \Big)
+&\rightsquigarrow \cN\big(0, \Sigma(x)\big)
+& &\text{where}
+&\Sigma(x) &=
+\frac{\sigma^2(x)}{f(x)} \left( \frac{4 - 4 \log 2}{3 } \right)^d.
+\end{align*}
+\end{theorem}
+
+The condition that $B \to \infty$ is crucial, ensuring sufficient ``mixing'' of
+different Mondrian cells to escape the heavy-tailed phenomenon detailed in the
+preceding discussion. 
For concreteness, the large forest condition allows us to +deal with expressions such as +$\E \left[ \frac{1}{|T_b(x)| |T_{b'}(x)|} \right] += \E \left[ \frac{1}{|T_b(x)|} \right] \E \left[ \frac{1}{|T_{b'}(x)|} \right] +\approx \lambda^{2d} < \infty$ +where $b \neq b'$, by independence of the trees, rather than the ``no +ensembling'' single tree analog +$\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$. + +We take this opportunity to contrast Mondrian random forests with more +classical kernel-based smoothing methods. The lifetime $\lambda$ plays a +similar role to the inverse bandwidth in determining the effective sample size +$n / \lambda^d$, and thus the associated rate of convergence. However, due to +the Mondrian process construction, some cells are typically ``too small'' +(equivalent to an insufficiently large bandwidth) to give an appropriate +effective sample size. Similarly, classical methods based on non-random +partitioning such as spline estimators \citep{huang2003local,cattaneo2020large} +typically impose a quasi-uniformity assumption to ensure all the cells are of +comparable size, a property which does not hold for the Mondrian process (not +even with probability approaching one). + +\subsection*{Bias characterization} + +We turn to the second term in \eqref{eq:mondrian_bias_variance}, which captures +the bias +of the Mondrian random forest estimator conditional on the covariates $\bX$ and +the forest $\bT$. As such, it is a random quantity which, as we will +demonstrate, converges in probability. We precisely characterize the limiting +non-random bias, including high-degree polynomials in $\lambda$ which for now +may seem ignorable. Indeed the magnitude of the bias is determined by its +leading term, typically of order $1/\lambda^2$ whenever $\beta \geq 2$, and +this suffices for ensuring a negligible contribution from the bias with an +appropriate choice of lifetime parameter. However, the advantage of specifying +higher-order bias terms is made apparent in Section~\ref{sec:mondrian_debiased} +when we +construct a debiased Mondrian random forest estimator. There, we target and +annihilate the higher-order terms in order to furnish superior estimation and +inference properties. +Theorem~\ref{thm:mondrian_bias} gives our main result on +the bias of the Mondrian random forest estimator. + +\begin{theorem}[Bias of the Mondrian random forest estimator]% +\label{thm:mondrian_bias} +% +Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} +hold. +Then for each $1 \leq r \leq \lfloor \flbeta / 2 \rfloor$ there exists +$B_r(x) \in \R$, which is a function only of +the derivatives of $f$ and $\mu$ at $x$ up to order $2r$, with +% +\begin{equation*} +\E \left[ \hat \mu(x) \mid \bX, \bT \right] += \mu(x) ++ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} +\frac{B_r(x)}{\lambda^{2r}} ++ O_\P \left( +\frac{1}{\lambda^\beta} ++ \frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} +\right). +\end{equation*} +% +Whenever $\beta > 2$ the leading bias is the quadratic term +% +\begin{equation*} +\frac{B_1(x)}{\lambda^2} += +\frac{1}{2 \lambda^2} +\sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} ++ \frac{1}{2 \lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j}. 
+\end{equation*} +% +If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$, +and using multi-index notation we have +% +\begin{equation*} +\frac{B_r(x)}{\lambda^{2r}} += \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) +\prod_{j=1}^d \frac{1}{\nu_j + 1}. +\end{equation*} +% +\end{theorem} + +In Theorem~\ref{thm:mondrian_bias} we give some explicit examples of +calculating the +limiting bias if $\beta > 2$ or when $X_i$ are uniformly distributed. The +general form of $B_r(x)$ is provided in Section~\ref{sec:mondrian_app_proofs} +but +is somewhat unwieldy except in specific situations. Nonetheless the most +important properties are that $B_r(x)$ are non-random and do not depend on the +lifetime $\lambda$, crucial facts for our debiasing procedure given in +Section~\ref{sec:mondrian_debiased}. If the forest size $B$ does not diverge to +infinity +then we suffer the first-order bias term $\frac{1}{\lambda \sqrt B}$. This +phenomenon was explained by \citet{mourtada2020minimax}, who noted that it +allows single Mondrian trees to achieve minimax optimality only when +$\beta \in (0, 1]$. Large forests remove this first-order bias +and are optimal for all $\beta \in (0, 2]$. + +Using Theorem~\ref{thm:mondrian_clt} and Theorem~\ref{thm:mondrian_bias} +together, +along with an appropriate choice of lifetime parameter $\lambda$, +gives a central limit theorem for the Mondrian random forest estimator +which can be used, for example, to build confidence intervals +for the unknown regression function $\mu(x)$ +whenever the bias shrinks faster than the standard deviation. +In general this will require +$\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} +\ll \sqrt{\frac{\lambda^d}{n}}$, +which can be satisfied by imposing the restrictions +$\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ +and $B \gg n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$ +on the lifetime $\lambda$ and forest size $B$. +If instead we aim for optimal point estimation, +then balancing the bias and standard deviation requires +$\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} +\asymp \sqrt{\frac{\lambda^d}{n}}$, +which can be satisfied by +$\lambda \asymp n^{\frac{1}{d + 2(2 \wedge \beta)}}$ +and $B \gtrsim n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$. +Such a choice of $\lambda$ gives the convergence rate +$n^{\frac{-(2 \wedge \beta)}{d + 2(2 \wedge \beta)}}$ +which is the minimax-optimal rate of convergence \citep{stone1982optimal} +for $\beta$-H{\"o}lder functions with $\beta \in (0,2]$ +as shown by \citet[Theorem~2]{mourtada2020minimax}. +In Section~\ref{sec:mondrian_debiased} we will show how the Mondrian random +forest +estimator can be debiased, giving both weaker lifetime conditions for inference +and also improved rates of convergence, under additional smoothness assumptions. + +\subsection*{Variance estimation} + +The limiting variance $\Sigma(x)$ from the resulting central limit theorem +depends on the unknown quantities $\sigma^2(x)$ and $f(x)$. +To conduct feasible inference, we must therefore first estimate +$\Sigma(x)$. To this end, define +% +\begin{align} +\label{eq:mondrian_sigma2_hat} +\hat\sigma^2(x) +&= +\frac{1}{B} \sum_{b=1}^{B} \sum_{i=1}^n +\frac{\big(Y_i - \hat \mu(x)\big)^2 \, \I\{X_i \in T_b(x)\}} {N_b(x)}, \\ +\nonumber +\hat\Sigma(x) +&= +\hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n +\left( \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_b(x)\}}{N_b(x)} \right)^2. 
+\end{align} +% +In Theorem~\ref{thm:mondrian_variance_estimation} we show that this +estimator is consistent, and establish its rate of convergence. +% +\begin{theorem}[Variance estimation]% +\label{thm:mondrian_variance_estimation} +Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, +and +suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then +% +\begin{align*} +\hat\Sigma(x) += \Sigma(x) ++ O_\P \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} +\right). +\end{align*} + +\end{theorem} + +\subsection{Confidence intervals} + +Theorem~\ref{thm:mondrian_confidence} shows how to construct valid confidence +intervals +for the regression function $\mu(x)$ under the lifetime and forest size +assumptions previously discussed. For details on feasible and practical +selection of the lifetime parameter $\lambda$, see +Section~\ref{sec:mondrian_parameter_selection}. +% +\begin{theorem}[Feasible confidence intervals using a Mondrian random forest]% +\label{thm:mondrian_confidence} +% +Suppose that Assumptions~\ref{ass:mondrian_data} and +\ref{ass:mondrian_estimator} hold, +$\E[Y_i^4 \mid X_i ]$ is bounded almost surely, +and $\frac{\lambda^d \log n}{n} \to 0$. Assume that +$\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ +and $B \gg n^{\frac{2 (2 \wedge \beta) - 2}{d + 2 (2 \wedge \beta)}}$. +For a confidence level $\alpha \in (0, 1)$, +let $q_{1 - \alpha / 2}$ be the normal quantile satisfying +$\P \left( \cN(0, 1) \leq q_{1 - \alpha / 2} \right) = 1 - \alpha / 2$. Then +% +\begin{align*} +\P \left( +\mu(x) \in +\left[ +\hat \mu(x) +- \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} +q_{1 - \alpha / 2}, \ +\hat \mu(x) ++ \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} +q_{1 - \alpha / 2} +\right] +\right) +\to +1 - \alpha. +\end{align*} + +\end{theorem} + +When coupled with an appropriate lifetime selection method, +Theorem~\ref{thm:mondrian_confidence} gives a fully feasible procedure for +uncertainty +quantification in Mondrian random forests. Our procedure requires no adjustment +of the original Mondrian random forest estimator beyond ensuring that the bias +is negligible, and in particular does not rely on sample splitting. The +construction of confidence intervals is just one corollary of the weak +convergence result given in Theorem~\ref{thm:mondrian_clt}, and follows +immediately from Slutsky's theorem +\citep[Chapter~7]{pollard2002user} +with a consistent variance estimator. Other applications +include hypothesis testing on the value of $\mu(x)$ at a design point $x$ by +inversion of the confidence interval, as well as parametric specification +testing by comparison with a $\sqrt{n}$-consistent parametric regression +estimator. The construction of simultaneous confidence intervals for finitely +many points $x_1, \ldots, x_D$ can be accomplished either using standard +multiple testing corrections or by first establishing a multivariate central +limit theorem using the Cram{\'e}r--Wold device +\citep[Chapter~8]{pollard2002user} +and formulating a consistent multivariate variance estimator. + +\section{Overview of proof strategy}% +\label{sec:mondrian_overview_proofs} + +This section provides some insight into the general approach we use to +establish the main results in the preceding sections. 
We focus on the technical
+innovations forming the core of our arguments, and refer the reader to
+Section~\ref{sec:mondrian_app_proofs} for detailed proofs, including those for
+the
+debiased estimator discussed in the upcoming
+Section~\ref{sec:mondrian_debiased}.
+
+\subsection*{Preliminary results}
+
+The starting point for our proofs is a characterization of the exact
+distribution of the shape of a Mondrian cell $T(x)$. This property is a direct
+consequence of the fact that the restriction of a Mondrian process to a subcell
+remains Mondrian \citep[Fact~2]{mourtada2020minimax}. We have
+%
+\begin{align*}
+|T(x)_j|
+&= \left( \frac{E_{j1}}{\lambda} \wedge x_j \right)
++ \left( \frac{E_{j2}}{\lambda} \wedge (1-x_j) \right)
+\end{align*}
+%
+for all $1 \leq j \leq d$, recalling that $T(x)_j$ is the side of the cell
+$T(x)$ aligned with axis $j$, and where $E_{j1}$ and $E_{j2}$ are mutually
+independent $\Exp(1)$ random variables. Our assumptions that $x \in (0,1)^d$ and
+$\lambda \to \infty$ make the boundary terms $x_j$ and $1-x_j$
+eventually ignorable so
+%
+\begin{align*}
+|T(x)_j| &= \frac{E_{j1} + E_{j2}}{\lambda}
+\end{align*}
+%
+with high probability. Controlling the size of the largest cell in the forest
+containing $x$ is now straightforward with a union bound, exploiting the sharp
+tail decay of the exponential distribution, and thus
+%
+\begin{align*}
+\max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j|
+\lesssim_\P \frac{\log B}{\lambda}.
+\end{align*}
+%
+This shows that, up to logarithmic terms, none of the cells in the forest at $x$
+are significantly larger than average, ensuring that the Mondrian random forest
+estimator is localized around $x$ on the scale of $1/\lambda$, an important
+property for the upcoming bias characterization.
+
+Having provided upper bounds for the sizes of Mondrian cells, we must also
+establish some lower bounds in order to quantify the ``small cell'' phenomenon
+mentioned previously. The first step towards this is to bound the first two
+moments of the truncated inverse Mondrian cell volume; we show that
+%
+\begin{align*}
+\E\left[ 1 \wedge \frac{1}{n |T(x)|} \right]
+&\asymp \frac{\lambda^d}{n}
+&&\text{and}
+&\frac{\lambda^{2d}}{n^2}
+&\lesssim
+\E\left[ 1 \wedge \frac{1}{n^2 |T(x)|^2} \right]
+\lesssim \frac{\lambda^{2d} \log n}{n^2}.
+\end{align*}
+%
+These bounds are computed directly using the exact distribution of $|T(x)|$.
+Note that $\E\left[ \frac{1}{|T(x)|^2} \right] = \infty$ because
+$\frac{1}{E_{j1} + E_{j2}}$ has only $2 - \delta$ finite moments, so the
+truncation is crucial here. Since we nearly have two moments, this
+truncation is at the expense of only a logarithmic term. Nonetheless, third and
+higher truncated moments will not enjoy such tight bounds, demonstrating both
+the fragility of this result and the inadequacy of tools such as the Lyapunov
+central limit theorem which require $2 + \delta$ moments. 
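+
+To illustrate where these bounds come from, consider for a moment the
+univariate case $d = 1$ with the boundary terms ignored as above, so that
+$|T(x)| = (E_{11} + E_{12}) / \lambda$ where $E_{11} + E_{12} \sim \Gam(2, 1)$
+has density $t e^{-t}$ on $(0, \infty)$. Then
+%
+\begin{align*}
+\E\left[ \frac{1}{|T(x)|} \right]
+&= \int_0^\infty \frac{\lambda}{t} \, t e^{-t} \diff{t}
+= \lambda
+&&\text{and}
+&\E\left[ \frac{1}{|T(x)|^2} \right]
+&= \int_0^\infty \frac{\lambda^2}{t^2} \, t e^{-t} \diff{t}
+= \infty,
+\end{align*}
+%
+while, using $\lambda / n \to 0$, the truncated second moment satisfies
+%
+\begin{align*}
+\E\left[ 1 \wedge \frac{1}{n^2 |T(x)|^2} \right]
+&= \int_0^{\lambda / n} t e^{-t} \diff{t}
++ \frac{\lambda^2}{n^2} \int_{\lambda / n}^{\infty} \frac{e^{-t}}{t} \diff{t} \\
+&\lesssim \frac{\lambda^2}{n^2}
++ \frac{\lambda^2}{n^2} \log \frac{n}{\lambda}
+\lesssim \frac{\lambda^2 \log n}{n^2}.
+\end{align*}
+%
+This calculation is purely illustrative; the general $d$-dimensional
+arguments are given in Section~\ref{sec:mondrian_app_proofs}.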
+
+To conclude this investigation into the small cell phenomenon, we apply the
+previous bounds to ensure that the empirical effective sample sizes
+$N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ are approximately
+of the order $n / \lambda^d$ in an appropriate sense; we demonstrate that
+%
+\begin{align*}
+\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \right]
+&\lesssim \frac{\lambda^d}{n}
+&&\text{and}
+&\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)^2} \right]
+&\lesssim \frac{\lambda^{2d} \log n}{n^2},
+\end{align*}
+%
+as well as similar bounds for mixed terms such as
+%
+$\E \left[
+\frac{\I\{N_b(x) \geq 1\}}{N_b(x)}
+\frac{\I\{N_{b'}(x) \geq 1\}}{N_{b'}(x)}
+\right]
+\lesssim \frac{\lambda^{2d}}{n^2}$
+%
+when $b \neq b'$, which arise from covariance terms across multiple trees. The
+proof of this result is involved and technical, and proceeds by induction. The
+idea is to construct a class of subcells by taking all possible intersections
+of the cells in $T_b$ and $T_{b'}$ (we show two trees here for clarity; there
+may be more) and noting that each $N_b(x)$ is the sum of the number of points
+in each such refined cell intersected with $T_b(x)$. We then swap out each
+refined cell one at a time and replace the number of data points it contains
+with its volume multiplied by $n f(x)$, showing that the expectation on the
+left hand side does not increase too much using a moment bound for inverse
+binomial random variables based on Bernstein's inequality. By induction and
+independence of the trees, eventually the problem is reduced to computing
+moments of truncated inverse Mondrian cell volumes, as above.
+
+\subsection*{Central limit theorem}
+
+To prove our main central limit theorem result
+(Theorem~\ref{thm:mondrian_clt}), we use
+the martingale central limit theorem given by
+\citet[Theorem~3.2]{hall1980martingale}. For each $1 \leq i \leq n$ define
+$\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and
+$(\varepsilon_j : 1 \leq j \leq i)$, noting that
+$\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases as $n$ increases.
+Define the $\cH_{n i}$-measurable and square integrable variables
+%
+\begin{align*}
+S_i(x) &=
+\sqrt{\frac{n}{\lambda^d}} \frac{1}{B} \sum_{b=1}^B
+\frac{\I \{X_i \in T_b(x)\} \varepsilon_i} {N_{b}(x)},
+\end{align*}
+%
+which satisfy the martingale difference property
+$\E [ S_i(x) \mid \cH_{n (i-1)} ] = 0$. Further,
+%
+\begin{align*}
+\sqrt{\frac{n}{\lambda^d}}
+\big(
+\hat\mu(x)
+- \E\left[
+\hat\mu(x) \mid \bX, \bT
+\right]
+\big)
+= \sum_{i=1}^n S_i(x).
+\end{align*}
+%
+To establish weak convergence to $\cN\big(0, \Sigma(x)\big)$,
+it suffices to check that $\max_i |S_i(x)| \to 0$ in probability,
+$\E\left[\max_i S_i(x)^2\right] \lesssim 1$,
+and $\sum_i S_i(x)^2 \to \Sigma(x)$ in probability.
+Checking the first two of these is straightforward given the denominator moment
+bounds derived above. For the third condition, we demonstrate that
+$\sum_i S_i(x)^2$ concentrates by checking its variance is vanishing. To do
+this, first observe that $S_i(x)^2$ is the square of a sum over the $B$ trees.
+Expanding this square, we see that the diagonal terms (where $b = b'$) provide
+a negligible contribution due to the large forest assumption. For the other
+terms, we apply the law of total variance and the moment bounds detailed
+earlier. Here, it is crucial that $b \neq b'$ in order to exploit the
+independence of the trees and avoid having to control any higher moments. 
The +law of total variance requires that we bound +% +\begin{align*} +\Var \left[ +\E \left[ +\sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} +\frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} +{N_{b}(x) N_{b'}(x)} \Bigm| \bX, \bY +\right] +\right], +\end{align*} +% +which is the variance of a non-linear function of the i.i.d.\ variables +$(X_i, \varepsilon_i)$, and so we apply the Efron--Stein inequality. +The important insight here is that replacing a sample +$(X_i, \varepsilon_i)$ with an independent copy +$(\tilde X_i, \tilde \varepsilon_i)$ can change the value of +$N_b(x)$ by at most one. Further, this can happen only on the event +$\{ X_i \in T_{b}(x) \} \cup \{ \tilde X_i \in T_{b}(x) \}$, +which occurs with probability on the order $1/\lambda^d$ +(the expected cell volume). + +The final part of the central limit theorem proof is to calculate the limiting +variance $\Sigma(x)$. The penultimate step showed that we must have +% +\begin{align*} +\Sigma(x) +&= \lim_{n \to \infty} \sum_{i=1}^n \E \left[S_i(x)^2 \right] += \lim_{n \to \infty} +\frac{n^2}{\lambda^d} \, +\E \left[ +\frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} +{N_{b}(x) N_{b'}(x)} +\right], +\end{align*} +% +assuming the limit exists, so it remains to check this and calculate the limit. +It is a straightforward but tedious exercise to verify that each term can be +replaced with its conditional expectation given $T_b$ and $T_{b'}$, using some +further properties of the binomial and exponential distributions. This yields +% +\begin{align*} +\Sigma(x) +&= +\frac{\sigma^2(x)}{f(x)} +\lim_{\lambda \to \infty} +\frac{1}{\lambda^d} +\E \left[ +\frac{|T_{b}(x) \cap T_{b'}(x)|} +{|T_{b}(x)| \, |T_{b'}(x)|} +\right] += \frac{\sigma^2(x)}{f(x)} +\E \left[ +\frac{(E_{1} \wedge E'_{1}) + (E_{2} \wedge E'_{2})} +{(E_{1} + E_{2}) (E'_{1} + E'_{2})} +\right]^d +\end{align*} +% +where $E_1$, $E_2$, $E'_1$, and $E'_2$ are independent $\Exp(1)$, +by the cell shape distribution and independence of the trees. This final +expectation is calculated by integration, using various incomplete gamma +function identities. + +\subsection*{Bias characterization} + +Our second substantial technical result is the bias characterization +given as Theorem~\ref{thm:mondrian_bias}, in which we precisely +characterize the probability limit of the conditional bias +% +\begin{align*} +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \mu(x) +&= +\frac{1}{B} \sum_{b=1}^B +\sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) +\frac{\I\{X_i \in T_b(x)\}}{N_b(x)}. +\end{align*} +% +The first step is to pass to the ``infinite forest'' +limit by taking an expectation conditional on $\bX$, or equivalently +marginalizing over $\bT$, applying the conditional Markov inequality +to see +% +\begin{align*} +\big| +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \E \left[ \hat \mu(x) \mid \bX \right] +\big| +&\lesssim_\P +\frac{1}{\lambda \sqrt B}. +\end{align*} +% +While this may seem a crude approximation, it is already known that fixed-size +Mondrian forests have suboptimal bias properties when compared to forests with +a diverging number of trees. In fact, the error $\frac{1}{\lambda \sqrt B}$ +exactly accounts for the first-order bias of individual Mondrian trees noted by +\citet{mourtada2020minimax}. + +Next we show that $\E \left[ \hat \mu(x) \mid \bX \right]$ converges in +probability to its expectation, again using the Efron--Stein theorem for this +non-linear function of the i.i.d.\ variables $X_i$. 
The Lipschitz property of +$\mu$ and the upper bound on the maximum cell size give +$|\mu(X_i) - \mu(x)| \lesssim \max_{1 \leq j \leq d} |T_b(x)_j| +\lesssim_\P \frac{\log B}{\lambda}$ +whenever $X_i \in T_b(x)$, +so we combine this with moment bounds for the denominator $N_b(x)$ to see +% +\begin{align*} +\left| +\E \left[ \hat \mu(x) \mid \bX \right] +- \E \left[ \hat \mu(x) \right] +\right| +\lesssim_\P +\frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}}. +\end{align*} + +The next step is to approximate the resulting non-random bias +$\E \left[ \hat \mu(x) \right] - \mu(x)$ as a polynomial in $1/\lambda$. +To this end, we firstly apply a concentration-type result for the binomial +distribution to deduce that +% +\begin{align*} +\E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \Bigm| \bT \right] +\approx \frac{1}{n \int_{T_b(x)} f(s) \diff s} +\end{align*} +% +in an appropriate sense, and hence, +by conditioning on $\bT$ and $\bX$ without $X_i$, we write +% +\begin{align} +\label{eq:mondrian_bias_ratio} +\E \left[ \hat \mu(x) \right] - \mu(x) +&\approx +\E \left[ +\frac{\int_{T_b(x)} (\mu(s) - \mu(x)) f(s) \diff s} +{\int_{T_b(x)} f(s) \diff s} +\right]. +\end{align} +% +Next we apply the multivariate version of Taylor's theorem to the integrands in +both the numerator and the denominator in \eqref{eq:mondrian_bias_ratio}, and +then apply +the Maclaurin series of $\frac{1}{1+x}$ and the multinomial theorem to recover +a single polynomial in $1/\lambda$. The error term is on the order of +$1/\lambda^\beta$ and depends on the smoothness of $\mu$ and $f$, and the +polynomial coefficients are given by various expectations involving exponential +random variables. The final step is to verify using symmetry of Mondrian cells +that all the odd monomial coefficients are zero, and to calculate some explicit +examples of the form of the limiting bias. + +\section{Debiased Mondrian random forests}% +\label{sec:mondrian_debiased} + +In this section we give our next main contribution, proposing a variant of the +Mondrian random forest estimator which corrects for higher-order bias with an +approach based on generalized jackknifing \citep{schucany1977improvement}. This +estimator retains the basic form of a Mondrian random forest estimator in the +sense that it is a linear combination of Mondrian tree estimators, but in this +section we allow for non-identical linear coefficients, some of which may be +negative, and for differing lifetime parameters across the trees. Since the +basic Mondrian random forest estimator is a special case of this more general +debiased version, we will discuss only the latter throughout the rest of the +chapter. + +We use the explicit form of the bias given in Theorem~\ref{thm:mondrian_bias} to +construct a debiased version of the Mondrian forest estimator. Let $J \geq 0$ +be the bias correction order. As such, with $J=0$ we retain the original +Mondrian forest estimator, with $J=1$ we remove second-order bias, and with +$J = \lfloor\flbeta / 2 \rfloor$ we remove bias terms up to and including order +$2 \lfloor\flbeta / 2 \rfloor$, giving the maximum possible bias reduction +achievable in the H{\"o}lder class $\cH^\beta$. As such, only bias terms of +order $1/\lambda^\beta$ will remain. + +For $0 \leq r \leq J$ let $\hat \mu_r(x)$ be a Mondrian forest estimator +based on the trees $T_{b r} \sim \cM\big([0,1]^d, \lambda_r \big)$ +for $1 \leq b \leq B$, where $\lambda_r = a_r \lambda$ for some $a_r > 0$ +and $\lambda > 0$. 
Write $\bT$ to denote the collection of all the trees, +and suppose they are mutually independent. We find values of $a_r$ along with +coefficients $\omega_r$ in order to annihilate the leading $J$ bias terms of +the debiased Mondrian random forest estimator +% +\begin{align} +\label{eq:mondrian_debiased} +\hat \mu_\rd(x) +&= \sum_{r=0}^J \omega_r \hat \mu_r(x) += \sum_{r=0}^{J} \omega_r +\frac{1}{B} \sum_{b=1}^B +\frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_{r b}(x) \big\}} {N_{r b}(x)}. +\end{align} +% +This ensemble estimator retains the ``forest'' structure of the original +estimators, but with varying lifetime parameters $\lambda_r$ and coefficients +$\omega_r$. Thus by Theorem~\ref{thm:mondrian_bias} we want to solve +% +\begin{align*} +\sum_{r=0}^{J} \omega_r +\left( \mu(x) + \sum_{s=1}^{J} \frac{B_{s}(x)}{a_r^{2s} \lambda^{2s}} \right) +&= \mu(x) +\end{align*} +% +for all $\lambda$, or equivalently the system of linear equations +$\sum_{r=0}^{J} \omega_r = 1$ +and $\sum_{r=0}^{J} \omega_r a_r^{-2s} = 0$ for each $1 \leq s \leq J$. +We solve these as follows. Define the $(J+1) \times (J+1)$ Vandermonde matrix +$A_{r s} = a_{r-1}^{2-2s}$, +and let $\omega = (\omega_0, \ldots, \omega_J)^\T \in \R^{J+1}$ +and $e_0 = (1, 0, \ldots, 0)^\T \in \R^{J+1}$. +Then a solution for the debiasing coefficients is given by +$\omega = A^{-1} e_0$ whenever $A$ is non-singular. +In practice we can take $a_r$ to be a fixed geometric or arithmetic sequence +to ensure this is the case, appealing to the Vandermonde determinant formula: +$\det A = \prod_{0 \leq r < s \leq J} (a_r^{-2} - a_s^{-2}) +\neq 0$ whenever $a_r$ are distinct. For example, we could set +$a_r = (1 + \gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. +Because we assume $\beta$, and therefore the choice of $J$, do not +depend on $n$, there is no need to quantify +the invertibility of $A$ by, for example, bounding its eigenvalues +away from zero as a function of $J$. + +\subsection{Central limit theorem} + +In Theorem~\ref{thm:mondrian_clt_debiased}, we verify that a central +limit theorem holds for the debiased +random forest estimator $\hat\mu_\rd(x)$ and give its limiting variance. +The strategy and challenges associated with proving +Theorem~\ref{thm:mondrian_clt_debiased} are identical to those discussed earlier +surrounding Theorem~\ref{thm:mondrian_clt}. In fact in +Section~\ref{sec:mondrian_app_proofs} +we provide a direct proof only for Theorem~\ref{thm:mondrian_clt_debiased} +and deduce Theorem~\ref{thm:mondrian_clt} as a special case. + +\begin{theorem}[Central limit theorem for the +debiased Mondrian random forest estimator]% +\label{thm:mondrian_clt_debiased} +% +Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} +hold, +$\E[Y_i^4 \mid X_i ]$ is bounded, +and $\frac{\lambda^d \log n}{n} \to 0$. Then +% +\begin{align*} +\sqrt{\frac{n}{\lambda^d}} +\Big( +\hat \mu_\rd(x) +- \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] +\Big) +&\rightsquigarrow +\cN\big(0, \Sigma_\rd(x)\big) +\end{align*} +% +where, with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} +\log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, +the limiting variance is +% +\begin{align*} +\Sigma_\rd(x) +&= +\frac{\sigma^2(x)}{f(x)} +\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} +\left( \ell_{r r'} + \ell_{r' r} \right)^d. 
+\end{align*} +% +\end{theorem} + +It is easy to verify that in the case of no debiasing we have +$J=0$ and $a_0 = \omega_0 = 1$, yielding +$\Sigma_\rd(x) = \Sigma(x)$, and recovering Theorem~\ref{thm:mondrian_clt}. + +\subsection*{Bias characterization} + +In Theorem~\ref{thm:mondrian_bias_debiased} we verify that this debiasing +procedure does indeed annihilate the desired bias terms, and its proof is a +consequence of Theorem~\ref{thm:mondrian_bias} and the construction of the +debiased Mondrian random forest estimator $\hat\mu_\rd(x)$. + +\begin{theorem}[Bias of the debiased Mondrian random forest estimator]% +\label{thm:mondrian_bias_debiased} +Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}. +In the notation of Theorem~\ref{thm:mondrian_bias} with +$\bar\omega = \sum_{r=0}^J \omega_r a_r^{-2J - 2}$, +% +\begin{align*} +\E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] +&= \mu(x) + \I\{2J+2 < \beta \} +\frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} \\ +&\quad+ +O_\P \left( +\frac{1}{\lambda^{2J + 4}} ++ \frac{1}{\lambda^\beta} ++ \frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} +\right). +\end{align*} +% +\end{theorem} + +Theorem~\ref{thm:mondrian_bias_debiased} has the following consequence: +the leading bias term is characterized in terms of +$B_{J+1}(x)$ whenever $J < \beta/2 - 1$, +or equivalently $J < \lfloor \flbeta/2 \rfloor$, +that is, the debiasing order +$J$ does not exhaust the H{\"o}lder smoothness $\beta$. +If this condition does not hold, then the estimator is +fully debiased, and the resulting leading bias +term is bounded above by $1/\lambda^\beta$ up to constants, +but its form is left unspecified. + +\subsection*{Variance estimation} + +As before, we propose a variance estimator in order to conduct feasible +inference and show that it is consistent. +With $\hat\sigma^2(x)$ as in \eqref{eq:mondrian_sigma2_hat} +in Section~\ref{sec:mondrian_inference}, define the estimator +% +\begin{align} +\label{eq:mondrian_debiased_variance_estimator} +\hat\Sigma_\rd(x) +&= +\hat\sigma^2(x) +\frac{n}{\lambda^d} +\sum_{i=1}^n +\left( +\sum_{r=0}^J +\omega_r +\frac{1}{B} +\sum_{b=1}^B +\frac{\I\{X_i \in T_{r b}(x)\}} +{N_{r b}(x)} +\right)^2. +\end{align} +% +\begin{theorem}[Variance estimation]% +\label{thm:mondrian_variance_estimation_debiased} +Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, +and +suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then +% +\begin{align*} +\hat\Sigma_\rd(x) += \Sigma_\rd(x) ++ O_\P \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{1}{\sqrt B} ++ \sqrt{\frac{\lambda^d \log n}{n}} +\right). +\end{align*} +% +\end{theorem} + +\subsection{Confidence intervals} + +In analogy to Section~\ref{sec:mondrian_inference}, +we now demonstrate the construction of feasible valid confidence +intervals using the debiased Mondrian random forest estimator +in Theorem~\ref{thm:mondrian_confidence_debiased}. +Once again we must ensure that the bias +(now significantly reduced due to our debiasing procedure) +is negligible when compared to the standard deviation +(which is of the same order as before). +We assume for simplicity that the estimator has been fully +debiased by setting $J \geq \lfloor \flbeta / 2\rfloor$ +to yield a leading bias of order $1/\lambda^\beta$, +but intermediate ``partially debiased'' versions can easily +be provided, with leading bias terms of order +$1/\lambda^{\beta \wedge (2J+2)}$ in general. 
+We thus require
+$\frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B}
+\ll \sqrt{\frac{\lambda^d}{n}}$,
+which can be satisfied by imposing the restrictions
+$\lambda \gg n^{\frac{1}{d + 2 \beta}}$
+and $B \gg n^{\frac{2\beta - 2}{d + 2\beta}}$
+on the lifetime parameter $\lambda$
+and forest size $B$.
+
+\begin{theorem}[Feasible confidence intervals using a
+debiased Mondrian random forest]%
+\label{thm:mondrian_confidence_debiased}
+%
+Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}
+hold,
+$\E[Y_i^4 \mid X_i ]$ is bounded,
+and $\frac{\lambda^d \log n}{n} \to 0$.
+Fix $J \geq \lfloor \flbeta / 2 \rfloor$ and assume that
+$\lambda \gg n^{\frac{1}{d + 2 \beta}}$
+and $B \gg n^{\frac{2 \beta - 2}{d + 2 \beta}}$.
+For a confidence level $\alpha \in (0, 1)$,
+let $q_{1 - \alpha / 2}$ be as in Theorem~\ref{thm:mondrian_confidence}. Then
+%
+\begin{align*}
+\P \left(
+\mu(x) \in
+\left[
+\hat \mu_\rd(x)
+- \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2}
+q_{1 - \alpha / 2}, \
+\hat \mu_\rd(x)
++ \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2}
+q_{1 - \alpha / 2}
+\right]
+\right)
+\to
+1 - \alpha.
+\end{align*}
+
+\end{theorem}
+
+One important benefit of our debiasing technique is made clear in
+Theorem~\ref{thm:mondrian_confidence_debiased}: the restrictions imposed on the
+lifetime
+parameter $\lambda$ are substantially relaxed, especially in smooth classes
+with large $\beta$. As well as the high-level benefit of relaxed conditions,
+this is also useful for practical selection of appropriate lifetimes for
+estimation and inference respectively; see
+Section~\ref{sec:mondrian_parameter_selection} for more details. Nonetheless,
+such
+improvements do not come without concession. The limiting variance
+$\Sigma_\rd(x)$ of the debiased estimator is larger than that of the non-debiased
+version (the extent of this increase depends on the choice of the debiasing
+parameters $a_r$), leading to wider confidence intervals and larger estimation
+error in small samples despite the theoretical asymptotic improvements.
+
+\subsection{Minimax optimality}
+
+Our final result Theorem~\ref{thm:mondrian_minimax} shows that,
+when using an appropriate sequence of lifetime parameters $\lambda$,
+the debiased Mondrian random forest estimator
+achieves, up to constants, the minimax-optimal rate of convergence
+for estimating a regression function $\mu \in \cH^\beta$
+in $d$ dimensions \citep{stone1982optimal}.
+This result holds for all $d \geq 1$ and all $\beta > 0$,
+complementing a previous result established only for $\beta \in (0, 2]$
+by \citet{mourtada2020minimax}.
+%
+\begin{theorem}[Minimax optimality of the debiased
+Mondrian random forest estimator]%
+\label{thm:mondrian_minimax}
+Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator},
+and let $J \geq \lfloor \flbeta / 2 \rfloor$,
+$\lambda \asymp n^{\frac{1}{d + 2 \beta}}$, and
+$B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$. Then
+%
+\begin{align*}
+\E \left[
+\big( \hat \mu_\rd(x) - \mu(x) \big)^2
+\right]^{1/2}
+&\lesssim
+\sqrt{\frac{\lambda^d}{n}}
++ \frac{1}{\lambda^\beta}
++ \frac{1}{\lambda \sqrt B}
+\lesssim
+n^{-\frac{\beta}{d + 2 \beta}}.
+\end{align*} +% +\end{theorem} + +The sequence of lifetime parameters $\lambda$ required in +Theorem~\ref{thm:mondrian_minimax} are chosen to balance the bias and standard +deviation bounds implied by Theorem~\ref{thm:mondrian_bias_debiased} and +Theorem~\ref{thm:mondrian_clt_debiased} respectively, in order to minimize the +pointwise +mean squared error. While selecting an optimal debiasing order $J$ needs only +knowledge of an upper bound on the smoothness $\beta$, choosing an optimal +sequence of $\lambda$ values does assume that $\beta$ is known a priori. The +problem of adapting to $\beta$ from data is challenging and beyond the scope of +this chapter; we provide some practical advice for tuning parameter +selection in Section~\ref{sec:mondrian_parameter_selection}. + +Theorem~\ref{thm:mondrian_minimax} complements the minimaxity results proven by +\citet{mourtada2020minimax} for Mondrian trees (with $\beta \leq 1$) and for +Mondrian random forests (with $\beta \leq 2$), with one modification: our +version is stated in pointwise rather than integrated mean squared error. This +is because our debiasing procedure is designed to handle interior smoothing +bias and so does not provide any correction for boundary bias. We leave +the development of such boundary corrections to future work, but constructions +similar to higher-order boundary-correcting kernels should be possible. If the +region of integration is a compact set in the interior of $[0,1]^d$, then we do +obtain an optimal integrated mean squared error bound: if $\delta \in (0, 1/2)$ +is fixed then under the same conditions as Theorem~\ref{thm:mondrian_minimax}, +% +\begin{align*} +\E \left[ +\int_{[\delta, 1-\delta]^d} +\big( +\hat \mu_\rd(x) +- \mu(x) +\big)^2 +\diff x +\right]^{1/2} +&\lesssim +\sqrt{\frac{\lambda^d}{n}} ++ \frac{1}{\lambda^\beta} ++ \frac{1}{\lambda \sqrt B} +\lesssim +n^{-\frac{\beta}{d + 2 \beta}}. +\end{align*} + +The debiased Mondrian random forest estimator defined in +\eqref{eq:mondrian_debiased} is +a linear combination of Mondrian random forests, and as such contains both a +sum over $0 \leq r \leq J$, representing the debiasing procedure, and a sum +over $1 \leq b \leq B$, representing the forest averaging. We have thus far +been interpreting this estimator as a debiased version of the standard Mondrian +random forest given in \eqref{eq:mondrian_estimator}, but it is +equally valid to swap the order of these sums. This gives rise to an +alternative point of view: we replace each Mondrian random tree with a +``debiased'' version, and then take a forest of such modified trees. This +perspective is more in line with existing techniques for constructing +randomized ensembles, where the outermost operation represents a $B$-fold +average of randomized base learners, not necessarily locally constant decision +trees, each of which has a small bias component \citep{caruana2004ensemble, +zhou2019deep, friedberg2020local}. + +\section{Tuning parameter selection}% +\label{sec:mondrian_parameter_selection} + +We discuss various procedures for selecting the parameters involved in fitting +a debiased Mondrian random forest; namely the base lifetime parameter +$\lambda$, the number of trees in each forest $B$, the bias correction order +$J$, and the debiasing scale parameters $a_r$ for $0 \leq r \leq J$. 
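+
+Throughout, the weights $\omega_r$ are determined by the chosen $J$ and $a_r$
+through the linear system of Section~\ref{sec:mondrian_debiased}. As a minimal
+illustration of this step, assuming the geometric choice $a_r = (1+\gamma)^r$
+suggested below (the function name here is ours and not part of any package),
+a Python sketch is
+%
+\begin{verbatim}
+import numpy as np
+
+def debiasing_coefficients(J, gamma=0.05):
+    """Weights omega_0, ..., omega_J cancelling the first J even bias
+    terms, using the geometric scales a_r = (1 + gamma) ** r."""
+    a = (1.0 + gamma) ** np.arange(J + 1)
+    # Row s enforces sum_r omega_r * a_r ** (-2 s) = 1 if s == 0 else 0.
+    M = a[np.newaxis, :] ** (-2.0 * np.arange(J + 1)[:, np.newaxis])
+    e0 = np.zeros(J + 1)
+    e0[0] = 1.0
+    return a, np.linalg.solve(M, e0)
+
+a, omega = debiasing_coefficients(J=1)
+print(a, omega)   # omega is approximately (-9.76, 10.76)
+\end{verbatim}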
+ +\subsection{Selecting the base lifetime parameter +\texorpdfstring{$\lambda$}{lambda}}% +\label{sec:mondrian_lifetime_selection} + +The most important parameter is the base Mondrian lifetime parameter $\lambda$, +which plays the role of a complexity parameter and thus governs the overall +bias--variance trade-off of the estimator. Correct tuning of $\lambda$ is +especially important in two main respects: +% +firstly, in order to use the central limit theorem established in +Theorem~\ref{thm:mondrian_clt_debiased}, we must have that the bias converges +to zero, +requiring $\lambda \gg n^{\frac{1}{d + 2\beta}}$. +% +Secondly, the minimax optimality result of Theorem~\ref{thm:mondrian_minimax} +is valid only in the regime $\lambda \asymp n^{\frac{1}{d + 2\beta}}$, and thus +requires careful determination in the more realistic finite-sample setting. For +clarity, in this section we use the notation $\hat\mu_\rd(x; \lambda, J)$ for +the debiased Mondrian random forest with lifetime $\lambda$ and debiasing order +$J$ as in \eqref{eq:mondrian_debiased}. +Similarly, write $\hat\Sigma_\rd(x; \lambda, J)$ for the associated +variance estimator given in \eqref{eq:mondrian_debiased_variance_estimator}. + +For minimax-optimal point estimation when $\beta$ is known, +choose any sequence $\lambda \asymp n^{\frac{1}{d + 2\beta}}$ +and use $\hat\mu_\rd(x; \lambda, J)$ with $J = \lfloor \flbeta / 2 \rfloor$, +following the theory given in Theorem~\ref{thm:mondrian_minimax}. +For an explicit example of how to choose the lifetime, one can instead use +$\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J-1\big)$ +so that the leading bias is explicitly characterized by +Theorem~\ref{thm:mondrian_bias_debiased}, +and with $\hat\lambda_{\AIMSE}(J-1)$ as defined below. +This is no longer minimax-optimal as $J-1 < J$ +does not satisfy the conditions of Theorem~\ref{thm:mondrian_minimax}. + +For performing inference, a more careful procedure is required; +we suggest the following method assuming $\beta > 2$. +Set $J = \lfloor \flbeta / 2 \rfloor$ as before, +and use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ +and $\hat\Sigma_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ +to construct a confidence interval. +The reasoning for this is that we select a lifetime tailored for a more biased +estimator than we actually use. This results in an inflated lifetime estimate, +guaranteeing the resulting bias is negligible when it is plugged into the fully +debiased estimator. This approach to tuning parameter selection and debiasing +for valid nonparametric inference corresponds to an application of robust bias +correction \citep{calonico2018effect,calonico2022coverage}, +where the point estimator is bias-corrected +and the robust standard error estimator incorporates the additional +sampling variability introduced by the bias correction. +This leads to a more refined distributional approximation +but does not necessarily exhaust the underlying +smoothness of the regression function. +An alternative inference approach based on Lepskii's method +\citep{lepskii1992asymptotically,birge2001alternative} +could be developed with the latter goal in mind. + +It remains to propose a concrete method for computing $\hat\lambda_{\AIMSE}(J)$ +in the finite-sample setting; we suggest two such procedures based on plug-in +selection with polynomial estimation and cross-validation respectively, +building on classical ideas from the nonparametric +smoothing literature \citep{fan2020statistical}. 
+
+\subsubsection*{Lifetime selection with polynomial estimation}
+
+Firstly, suppose $X_i \sim \Unif\big([0,1]^d\big)$
+and that the leading bias of $\hat\mu_\rd(x)$ is well approximated by an
+additively separable function so that,
+writing $\partial^{2 J + 2}_j \mu(x)$
+for $\partial^{2 J + 2} \mu(x) / \partial x_j^{2 J + 2}$,
+%
+\begin{align*}
+\frac{\bar \omega B_{J+1}(x)}{\lambda^{2 J + 2}}
+&\approx
+\frac{1}{\lambda^{2 J + 2}}
+\frac{\bar \omega }{J + 2}
+\sum_{j=1}^d
+\partial^{2 J + 2}_j \mu(x).
+\end{align*}
+%
+Now suppose the model is homoscedastic so $\sigma^2(x) = \sigma^2$ and
+the limiting variance of $\hat\mu_\rd$ is
+%
+\begin{align*}
+\frac{\lambda^d}{n}
+\Sigma_\rd(x)
+&=
+\frac{\lambda^d \sigma^2}{n}
+\sum_{r=0}^{J}
+\sum_{r'=0}^{J}
+\omega_r
+\omega_{r'}
+\left( \ell_{r r'} + \ell_{r' r} \right)^d.
+\end{align*}
+%
+The asymptotic integrated mean squared error (AIMSE) is
+%
+\begin{align*}
+\AIMSE(\lambda, J)
+&=
+\frac{1}{\lambda^{4 J + 4}}
+\frac{\bar \omega^2}{(J + 2)^2}
+\int_{[0,1]^d}
+\left(
+\sum_{j=1}^d
+\partial^{2 J + 2}_j \mu(x)
+\right)^2
+\diff x \\
+&\quad+
+\frac{\lambda^d \sigma^2}{n}
+\sum_{r=0}^{J}
+\sum_{r'=0}^{J}
+\omega_r
+\omega_{r'}
+\left( \ell_{r r'} + \ell_{r' r} \right)^d.
+\end{align*}
+%
+Minimizing over $\lambda > 0$ yields the AIMSE-optimal lifetime parameter
+%
+\begin{align*}
+\lambda_{\AIMSE}(J)
+&=
+\left(
+\frac{
+\frac{(4 J + 4) \bar \omega^2}{(J + 2)^2}
+n \int_{[0,1]^d}
+\left(
+\sum_{j=1}^d
+\partial^{2 J + 2}_j \mu(x)
+\right)^2
+\diff x
+}{
+d \sigma^2
+\sum_{r=0}^{J}
+\sum_{r'=0}^{J}
+\omega_r
+\omega_{r'}
+\left( \ell_{r r'} + \ell_{r' r} \right)^d
+}
+\right)^{\frac{1}{4 J + 4 + d}}.
+\end{align*}
+%
+An estimator of $\lambda_{\AIMSE}(J)$ is therefore given by
+%
+\begin{align*}
+\hat\lambda_{\AIMSE}(J)
+&=
+\left(
+\frac{
+\frac{(4 J + 4) \bar \omega^2}{(J + 2)^2}
+\sum_{i=1}^n
+\left(
+\sum_{j=1}^d
+\partial^{2 J + 2}_j \hat\mu(X_i)
+\right)^2
+}{
+d \hat\sigma^2
+\sum_{r=0}^{J}
+\sum_{r'=0}^{J}
+\omega_r
+\omega_{r'}
+\left( \ell_{r r'} + \ell_{r' r} \right)^d
+}
+\right)^{\frac{1}{4 J + 4 + d}}
+\end{align*}
+%
+for some preliminary estimators
+$\partial^{2 J + 2}_j \hat\mu(x)$ and $\hat\sigma^2$.
+These can be obtained by fitting a global polynomial regression
+to the data of order $2 J + 4$ without interaction terms.
+To do this, define the $n \times ((2 J + 4)d + 1)$ design matrix $P$ with rows
+%
+\begin{align*}
+P_i = \big(
+1, X_{i1}, X_{i1}^2, \ldots, X_{i1}^{2 J + 4},
+X_{i2}, X_{i2}^2, \ldots, X_{i2}^{2 J + 4},
+\ldots,
+X_{id}, X_{id}^2, \ldots, X_{id}^{2 J + 4}
+\big),
+\end{align*}
+%
+and let
+%
+$P_x = \big(
+1, x_{1}, x_{1}^2, \ldots, x_{1}^{2 J + 4},
+x_{2}, x_{2}^2, \ldots, x_{2}^{2 J + 4},
+\ldots,
+x_{d}, x_{d}^2, \ldots, x_{d}^{2 J + 4}
+\big).
+$
+%
+Then we define the derivative estimator as
+%
+\begin{align*}
+\partial^{2 J + 2}_j \hat\mu(x)
+&=
+\partial^{2 J + 2}_j P_x
+\big( P^\T P \big)^{-1}
+P^\T \bY \\
+&=
+(2J + 2)!
+\left(
+0_{1 + (j-1)(2 J + 4) + (2J + 1)},
+1, (2J + 3) x_j, (2J + 4)(2J + 3) x_j^2 / 2,
+0_{(d-j)(2 J + 4)}
+\right)
+\big( P^\T P \big)^{-1}
+P^\T \bY,
+\end{align*}
+%
+and the variance estimator $\hat\sigma^2$ is
+based on the residual sum of squared errors of this model:
+%
+\begin{align*}
+\hat\sigma^2
+&=
+\frac{1}{n - (2J + 4)d - 1}
+\big(
+\bY^\T \bY
+- \bY^\T P \big( P^\T P \big)^{-1} P^\T \bY
+\big).
+\end{align*}
+
+\subsubsection*{Lifetime selection with cross-validation}
+
+As an alternative to the analytic plug-in methods described above, one can use
+a cross-validation approach. 
While leave-one-out cross-validation (LOOCV) can +be applied directly \citep{fan2020statistical}, +the linear smoother structure of the (debiased) Mondrian +random forest estimator allows a computationally simpler formulation. Writing +$\hat\mu_\rd^{-i}(x)$ for a debiased Mondrian random forest estimator fitted +without the $i$th data sample, it is easy to show that +% +\begin{align*} +\LOOCV(\lambda, J) +&= +\frac{1}{n} +\sum_{i=1}^{n} +\left( Y_i - \hat\mu_\rd^{-i}(X_i) \right)^2 \\ +&= +\frac{1}{n} +\sum_{i=1}^{n} +\left( +\sum_{r=0}^{J} +\omega_r +\frac{1}{B} +\sum_{b=1}^{B} +\frac{1}{1 - 1/N_{r b}(X_i)} +\left( Y_i - +\sum_{j=1}^{n} +\frac{ Y_j \I \left\{ X_j \in T_{r b}(X_i) \right\}} +{N_{r b}(X_i)} +\right) +\right)^{2}, +\end{align*} +% +avoiding refitting the model leaving each sample out in turn. +Supposing $X_i \sim \Unif\big([0,1]^d\big)$ and +replacing $1/N_{r b}(X_i)$ with their average expectation +$ \frac{1}{J+1} \sum_{r=0}^{J} \E \left[ 1/N_{r b}(X_i) \right] +\approx \bar a^d \lambda^d / n$ +where $\bar a^d = \frac{1}{J+1} \sum_{r=0}^{J} a_r^d$ +gives the generalized cross-validation (GCV) formula +% +\begin{align} +\label{eq:mondrian_gcv} +\GCV(\lambda, J) +&= +\frac{1}{n} +\sum_{i=1}^{n} +\left( +\frac{Y_i - \hat\mu_\rd(X_i)} +{1 - \bar a^d \lambda^d / n} +\right)^2. +\end{align} +% +The lifetime can then be selected by computing +either $\hat\lambda_{\LOOCV} \in \argmin_\lambda \LOOCV(\lambda, J)$ +or $\hat\lambda_{\GCV} \in \argmin_\lambda \GCV(\lambda, J)$. +See Section~\ref{sec:mondrian_weather} for a practical illustration. + +\subsection{Choosing the other parameters} + +\subsubsection*{The number \texorpdfstring{$B$}{B} of trees in each forest}% + +If no debiasing is applied, we suggest +$B = \sqrt{n}$ to satisfy +Theorem~\ref{thm:mondrian_confidence}. +If debiasing is used then we recommend +$B = n^{\frac{2J-1}{2J}}$, consistent with +Theorem~\ref{thm:mondrian_confidence_debiased} +and Theorem~\ref{thm:mondrian_minimax}. + +\subsubsection*{The debiasing order \texorpdfstring{$J$}{J}}% + +When debiasing a Mondrian random forest, one must decide +how many orders of bias to remove. This requires some +oracle knowledge of the H{\"o}lder smoothness of $\mu$ and $f$, which is +difficult to estimate statistically. As such, we recommend +removing only the first one or two bias terms, taking $J \in \{0,1,2\}$ to +avoid overly inflating the variance of the estimator. + +\subsubsection*{The debiasing coefficients \texorpdfstring{$a_r$}{ar}}% + +As in Section~\ref{sec:mondrian_debiased}, we take $a_r$ to be a fixed +geometric or arithmetic sequence. For example, one could set +$a_r = (1+\gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. +We suggest taking $a_r = 1.05^r$. + +\section{Illustrative example: weather forecasting}% +\label{sec:mondrian_weather} + +To demonstrate our methodology for estimation and inference with Mondrian random +forests, we consider a simple application +to a weather forecasting problem. We emphasize that the main aim of this +section is to provide intuition and understanding for how a Mondrian random +forest may be used in practice, and we refrain from an in-depth analysis of the +specific results obtained. Indeed, our assumption of i.i.d.\ data is +certainly violated with weather data, due to the time-series +structure of sequential observations. +Nonetheless, we use data from the \citet{bureau2017daily}, containing daily +weather information from 2007--2017, at 49 different +locations across Australia, with $n = 125\,927$ samples. 
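+
+To make the construction concrete, the following self-contained Python sketch
+computes a Mondrian random forest estimate at a single query point. It relies
+on the fact, recalled in Section~\ref{sec:mondrian_overview_proofs}, that the
+cell containing $x$ has independent $\Exp(1)/\lambda$ gaps in each axis-aligned
+direction, truncated at the boundary of $[0,1]^d$, so no full partition needs
+to be generated for a single point. The synthetic data below merely stand in
+for the scaled weather covariates, the empty-cell convention (such trees
+contribute zero) is our assumption, and this sketch is not the implementation
+used to produce the empirical results in this section.
+%
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+
+def mondrian_forest_at_point(X, Y, x, lam, B):
+    """Average of B Mondrian tree estimates of mu(x) at a single point x,
+    each tree contributing the mean response in its cell containing x."""
+    d = X.shape[1]
+    estimates = []
+    for _ in range(B):
+        lower = x - np.minimum(rng.exponential(size=d) / lam, x)
+        upper = x + np.minimum(rng.exponential(size=d) / lam, 1.0 - x)
+        in_cell = np.all((X >= lower) & (X <= upper), axis=1)
+        estimates.append(Y[in_cell].mean() if in_cell.any() else 0.0)
+    return float(np.mean(estimates))
+
+# Synthetic stand-in for the scaled (humidity, pressure) data in [0, 1]^2.
+n = 2_000
+X = rng.uniform(size=(n, 2))
+Y = (rng.uniform(size=n) < 0.2 + 0.6 * X[:, 0] * (1.0 - X[:, 1])).astype(float)
+print(mondrian_forest_at_point(X, Y, x=np.array([0.7, 0.4]), lam=5.0, B=40))
+\end{verbatim}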
+ +\begin{figure}[b!] +\centering +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_data.png}% +\end{subfigure} +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_data_filled_partition.png}% +\end{subfigure} +\caption[Australian weather forecasting data]{ +Australian weather forecasting data. Left: colors indicate the response +variable of dry (orange) or wet (blue) on the following +day. Right: the data is overlaid with a Mondrian random tree, +fitted with a lifetime of $\lambda = 5$ +selected by generalized cross-validation. Cell colors represent the response +proportions.} +\label{fig:mondrian_weather_data} +\end{figure} + +We consider the classification problem of predicting whether or not it will +rain on the following day using two covariates: the percentage relative +humidity, and the pressure in mbar, both at 3pm on the current day. For the +purpose of framing this as a nonparametric regression problem, we consider +estimating the probability of rain as the regression function by setting +$Y_i = 1$ if there is rain on the following day and $Y_i = 0$ otherwise. +Outliers with pressure less than 985\,mbar or more than 1040\,mbar are removed +to justify the assertion in Assumption~\ref{ass:mondrian_data} that the density +of the covariates should be bounded away from zero, and the features are +linearly scaled to provide normalized samples +$(X_i, Y_i) \in [0, 1]^2 \times \{0, 1\}$. +We fit a Mondrian random forest to the data as defined in +Section~\ref{sec:mondrian_forests}, selecting the lifetime parameter with the +generalized cross-validation (GCV) method detailed in +Section~\ref{sec:mondrian_lifetime_selection}. + +Figure~\ref{fig:mondrian_weather_data} plots the +data, using colors to indicate the response values, and illustrates how a +single Mondrian tree is fitted by sampling from an independent Mondrian process +and then computing local averages (equivalent to response proportions in this +special setting with binary outcomes) within each cell. The general pattern of +rain being predicted by high humidity and low pressure is apparent, with the +preliminary tree estimator taking the form of a step function on axis-aligned +rectangles. This illustrates the first-order bias of Mondrian random trees +discussed in Section~\ref{sec:mondrian_clt}, with the piecewise constant +estimator providing a poor approximation for the smooth true regression +function. + +\begin{figure}[b!] +\centering +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_forest_2.png}% +\end{subfigure} +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_forest_design.png}% +\end{subfigure} +\caption[Fitting Mondrian random forests to the Australian weather data]{ +Fitting Mondrian random forests to the Australian weather data. +Left: with $B=2$ trees, individual cells are clearly visible and the step +function persists. Right: with $B=40$ trees, the estimate is much smoother +as the individual tree estimates average out. +Three design points are identified for further analysis.} +\label{fig:mondrian_weather_forest} +\end{figure} + +Figure~\ref{fig:mondrian_weather_forest} adds more trees to the estimator, +demonstrating the effect of increasing the forest size first to $B=2$ +and then to $B=40$. 
+As more trees are included in the Mondrian random forest, +the regression estimate $\hat \mu(x)$ becomes smoother and therefore also +enjoys improved bias properties as shown in +Theorem~\ref{thm:mondrian_bias}, assuming a correct model specification. +We also choose three specific design points in the +(humidity, pressure) covariate space, +namely (20\%, 1020\,mbar), (70\%, 1000\,mbar), and (80\%, 990\,mbar), +at which to conduct inference +by constructing confidence intervals. See Table~\ref{tab:mondrian_weather_ci} +for the results. + +\begin{figure}[b!] +\centering +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_gcv.png}% +\end{subfigure} +\begin{subfigure}{0.49\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/weather_debiased_forest_design.png}% +\end{subfigure} +\caption[Cross-validation and debiasing for the Australian weather data]{ +Left: mean squared error and generalized cross-validation scores +for Mondrian random forests with the Australian weather data. +Right: a debiased Mondrian random forest with $B=20$, giving $40$ trees +in total. Three design points are identified for further analysis.} +\label{fig:mondrian_weather_gcv} +\end{figure} + +In Figure~\ref{fig:mondrian_weather_gcv} we show the mean squared error and GCV +scores +computed using \eqref{eq:mondrian_gcv} with $B=400$ trees +for several candidate lifetime parameters $\lambda$. As +expected, the mean squared error decreases monotonically +as $\lambda$ increases and the model +overfits, but the GCV score is minimized at a value which appropriately +balances the bias and variance; we take $\lambda = 5$. +We then fit a debiased Mondrian forest +with bias correction order $J = 1$ as described in +Section~\ref{sec:mondrian_debiased}, using $B=20$ trees at each debiasing level +$r \in \{0, 1\}$ for a total of $40$ trees. +We continue to use the same lifetime parameter +$\lambda = 5$ selected through GCV without debiasing, following the approach +recommended in Section~\ref{sec:mondrian_lifetime_selection} to ensure valid +inference +through negligible bias. +The resulting debiased Mondrian random forest estimate is noticeably +less smooth than the version without bias correction. +This is expected due to both the inflated variance resulting from the debiasing +procedure, and the undersmoothing enacted by selecting a lifetime parameter +using GCV on the original estimator without debiasing. + +\begin{table}[t] +\centering +\begin{tabular}{|c|c|c|c|c|c|c|} +\hline +\multirow{2}{*}{Point} +& \multirow{2}{*}{Humidity} +& \multirow{2}{*}{Pressure} +& \multicolumn{2}{|c|}{No debiasing, $J=0$} +& \multicolumn{2}{|c|}{Debiasing, $J=1$} \\ +\cline{4-7} +& & +& $\hat\mu(x)$ & 95\% CI +& $\hat\mu(x)$ & 95\% CI \\ +\hline +$1$ & $20\%$ & $1020\,\textrm{mbar}$ & +$\phantom{0}4.2\%$ & +$3.9\%$ -- $4.5\%$ & +$\phantom{0}2.0\%$ & +$1.6\%$ -- $2.4\%$ \\ +$2$ & $70\%$ & $1000\,\textrm{mbar}$ & +$52.6\%$ & +$51.7\%$ -- $53.6\%$ & +$59.8\%$ & +$57.8\%$ -- $61.9\%$ \\ +$3$ & $80\%$ & $\phantom{1}990\,\textrm{mbar}$ & +$78.1\%$ & +$75.0\%$ -- $81.2\%$ & +$93.2\%$ & +$86.7\%$ -- $99.6\%$ \\ +\hline +\end{tabular} +\caption[Results for the Australian weather data]{ +Results for the Australian weather data +at three specified design points.} +\label{tab:mondrian_weather_ci} +\end{table} + +Table~\ref{tab:mondrian_weather_ci} presents numerical results for estimation +and +inference at the three specified design points. 
We first give the outcomes +without debiasing, using a Mondrian random forest with $B = 400$ trees and +$\lambda = 5$ selected by GCV. We then show the results with a first-order +($J=1$) debiased Mondrian random forest using $B = 200$ (again a total of +$400$ trees) and the same value of $\lambda = 5$. The predicted chance of rain +$\hat\mu(x)$ is found to vary substantially across different covariate values, +and the resulting confidence intervals (CI) are generally narrow due to the +large sample size and moderate lifetime parameter. The forest with debiasing +exhibits more extreme predictions away from $50\%$ and wider confidence +intervals in general, in line with the illustration in +Figure~\ref{fig:mondrian_weather_gcv}. Interestingly, the confidence intervals +for the +non-debiased and debiased estimators do not intersect, indicating that the +original estimator is severely biased, and providing further justification for +our modified debiased random forest estimator. + +\section{Conclusion}% +\label{sec:mondrian_conclusion} + +We gave a central limit theorem for the Mondrian random forest estimator +and showed how to perform statistical inference on an unknown nonparametric +regression function. We introduced debiased versions of the Mondrian random +forest, and demonstrated their advantages +for statistical inference and minimax-optimal estimation. We discussed +tuning parameter selection, enabling a fully feasible and practical methodology. +An application to weather forecasting was presented +as an illustrative example. Implementations of this chapter's methodology and +empirical results are provided by a Julia +package at \github{wgunderwood/MondrianForests.jl}. +This work is based on \citet{cattaneo2023inference}, and has been +presented by Underwood at the University of Illinois Statistics Seminar (2024), +the University of Michigan Statistics Seminar (2024), and the University of +Pittsburgh Statistics Seminar (2024). + +\chapter{Dyadic Kernel Density Estimators} +\label{ch:kernel} + +% abstract +Dyadic data is often encountered when quantities of interest are associated +with the edges of a network. As such, it plays an important role in statistics, +econometrics, and many other data science disciplines. We consider the problem +of uniformly estimating a dyadic Lebesgue density function, focusing on +nonparametric kernel-based estimators taking the form of dyadic empirical +processes. The main contributions of this chapter +include the minimax-optimal uniform +convergence rate of the dyadic kernel density estimator, along with strong +approximation results for the associated standardized and Studentized +$t$-processes. A consistent variance estimator enables the construction of +valid and feasible uniform confidence bands for the unknown density function. +We showcase the broad applicability of our results by developing novel +counterfactual density estimation and inference methodology for dyadic data, +which can be used for causal inference and program evaluation. A crucial +feature of dyadic distributions is that they may be ``degenerate'' at certain +points in the support of the data, a property making our analysis somewhat +delicate. Nonetheless our methods for uniform inference remain robust to the +potential presence of such points. For implementation purposes, we discuss +inference procedures based on positive semi-definite covariance estimators, +mean squared error optimal bandwidth selectors, and robust bias correction +techniques. 
We illustrate the empirical finite-sample performance of our
+methods both in simulations and with real-world trade data, for which we make
+comparisons between observed and counterfactual trade distributions in
+different years. Our technical results concerning strong approximations and
+maximal inequalities are of potential independent interest.
+
+\section{Introduction}
+\label{sec:kernel_introduction}
+
+Dyadic data, also known as graphon data, plays an important role in the
+statistical, social, behavioral, and biomedical sciences. In network settings,
+this type of dependent data captures interactions between the units of study,
+and its analysis is of interest in statistics \citep{kolaczyk2009statistical},
+economics \citep{graham2020network}, psychology \citep{kenny2020dyadic}, public
+health \citep{luke2007network}, and many other data science disciplines. For
+$n \geq 2$, a dyadic data set contains $\frac{1}{2}n(n-1)$ observed real-valued
+random variables
+%
+\begin{align*}
+\bW_n = (W_{i j}: 1 \leq i < j \leq n).
+\end{align*}
+
+For $C>0$, define the H\"{o}lder class with smoothness parameter
+$\beta > 0$ to be
+$\cH^\beta_C(\cX) =
+\big\{
+g \in \cC^{\flbeta}(\cX) \! : \!
+\max_{1 \leq r \leq \flbeta}
+\big| g^{(r)}(x) \big| \leq C,
+\big| g^{(\flbeta)}(x) - g^{(\flbeta)}(x') \big|
+\leq C |x-x'|^{\beta - \flbeta},
+\forall x, x' \in \cX
+\big\}$,
+where $\flbeta$ denotes the largest integer which is strictly less than $\beta$.
+Note that $\cH^1_C(\cX)$ is the class of $C$-Lipschitz functions on $\cX$.
+For $a \in \R$ and $b \geq 0$, we write $[a \pm b]$ for the interval
+$[a-b, a+b]$. For non-negative sequences $a_n$ and $b_n$, write
+$a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that
+$a_n / b_n$ is bounded for $n\geq 1$.
+Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$.
+If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$.
+For random non-negative sequences $A_n$ and $B_n$, write
+$A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if
+$A_n / B_n$ is bounded in probability.
+Write $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability.
+For $a,b \in \R$, define $a\wedge b=\min\{a,b\}$ and $a \vee b = \max\{a,b\}$.
+
+\section{Setup}\label{sec:kernel_setup}
+
+We impose the following two assumptions throughout this chapter,
+which concern firstly the dyadic data generating process, and
+secondly the choice of kernel and bandwidth sequence.
+
+%
+\begin{assumption}[Data generation]
+\label{ass:kernel_data}
+%
+% A and V variables
+Let $\bA_n = (A_i: 1 \leq i \leq n)$ be i.i.d.\ random variables supported on
+$\cA \subseteq \R$ and let $\bV_n = (V_{i j}: 1 \leq i < j \leq n)$ be
+i.i.d.\ random variables with a Lebesgue density $f_V$ on $\R$, with $\bA_n$
+independent of $\bV_n$.
+%
+% W variables
+Let $W_{i j} = W(A_i, A_j, V_{i j})$ and
+$\bW_n = (W_{i j}: 1 \leq i < j \leq n)$, where $W$ is an unknown real-valued
+function which is symmetric in its first two arguments.
+%
+Let $\cW \subseteq \R$ be a compact interval with positive Lebesgue measure
+$\Leb(\cW)$. The conditional distribution of $W_{i j}$ given $A_i$ and $A_j$
+admits a Lebesgue density $f_{W \mid AA}(w \mid A_i, A_j)$.
+For $C_\rH > 0$ and $\beta \geq 1$, take $f_W \in \cH^\beta_{C_\rH}(\cW)$
+where $f_{W}(w) = \E\left[f_{W \mid AA}(w \mid A_i,A_j)\right]$ and
+$f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$
+for all $a,a' \in \cA$. Suppose
+$\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV <\infty$ where
+$f_{W \mid A}(w \mid a) = \E\left[f_{W \mid AA}(w \mid A_i,a)\right]$. 
+% +\end{assumption} + +In Assumption~\ref{ass:kernel_data} we require the density $f_W$ be in a +$\beta$-smooth H\"older class of functions on the compact interval $\cW$. +H\"older classes are well established in the minimax estimation literature +\citep{stone1982optimal,gine2021mathematical}, +with the smoothness parameter $\beta$ appearing +in the minimax-optimal rate of convergence. If the H\"older condition is +satisfied only piecewise, then our results remain valid provided that the +boundaries between the pieces are known and treated as boundary points. + +If $W(a_1, a_2, v)$ is strictly monotonic and continuously differentiable in +its third argument, we can give the conditional density of $W_{i j}$ explicitly +using the usual change-of-variables formula: with $w=W(a_1,a_2,v)$, we have +$f_{W \mid AA}(w \mid a_1,a_2) += f_V(v) \big|\partial W(a_1,a_2,v)/\partial v\big|^{-1}$. + +\begin{assumption}[Kernels and bandwidth] +\label{ass:kernel_bandwidth}% +% +Let $h = h(n) > 0$ be a sequence of bandwidths satisfying $h \log n \to 0$ +and $\frac{\log n}{n^2h} \to 0$. For each $w \in \cW$, let $k_h(\cdot, w)$ be +a real-valued function supported on $[w \pm h] \cap \cW$. For an integer +$p \geq 1$, let $k_h$ belong to a family of boundary bias-corrected kernels +of order $p$, i.e., +% +\begin{align*} +\int_{\cW} +(s-w)^r k_h(s,w) \diff{s} +\quad +\begin{cases} +\begin{alignedat}{2} +&= 1 &\qquad &\text{for all } w \in \cW \text{ if }\, r = 0, \\ +&= 0 & &\text{for all } w \in \cW \text{ if }\, 1 \leq r \leq p-1, \\ +&\neq 0 & &\text{for some } w \in \cW \text{ if }\, r = p. +\end{alignedat} +\end{cases} +\end{align*} +% +Also, for $C_\rL > 0$, +suppose $k_h(s, \cdot) \in \cH^1_{C_\rL h^{-2}}(\cW)$ +for all $s \in \cW$. +% +\end{assumption} + +This assumption allows for all standard compactly supported and possibly +boundary-corrected kernel functions +\citep{wand1994kernel,simonoff1996smoothing}, constructed for example by taking +polynomials on a compact interval and solving a linear system for the +coefficients. Assumption~\ref{ass:kernel_bandwidth} implies +(see Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} +in Appendix~\ref{app:kernel}) +that if $h \leq 1$ then $k_h$ is uniformly bounded by +$C_\rk h^{-1}$ where $C_\rk \vcentcolon = 2 C_\rL + 1 + 1/\Leb(\cW)$. + +\subsection{Bias characterization} +\label{sec:kernel_bias} + +We begin by characterizing and bounding the bias +$B_n(w) = \E \big[ \hat f_W(w) \big] - f_W(w)$. +Theorem~\ref{thm:kernel_bias} is a standard result for the non-random smoothing +bias in kernel density estimation with higher-order kernels and boundary bias +correction, and does not rely on the dyadic structure. + +\begin{theorem}[Bias bound] +\label{thm:kernel_bias} + +Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} +hold. For $w \in \cW$ define the leading bias term as +% +\begin{align*} +b_p(w) +&= +\frac{f_W^{(p)}(w)}{p!} +\int_{\cW} +k_h(s,w) +\left( +\frac{s-w}{h} +\right)^p +\diff{s}. +\end{align*} +% +for $1 \leq p \leq \flbeta$. Then we have the following bias bounds. +% +\begin{enumerate}[label=(\roman*)] +\item If $p \leq \flbeta - 1$, then +$\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | +\leq \frac{2 C_\rk C_\rH}{(p+1)!} h^{p+1}$. + +\item If $p = \flbeta$, then +$\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | +\leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. + +\item If $p \geq \flbeta+1$, then +$\sup_{w \in \cW} | B_n(w) | +\leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. 
+\end{enumerate}
+%
+Noting that $\sup_{\cW} |b_p(w)| \leq 2 C_\rk C_\rH / p!$,
+we deduce that for $h \leq 1$,
+%
+\begin{align*}
+\sup_{w \in \cW} | B_n(w) |
+\leq
+\frac{4 C_\rk C_\rH}{(p \wedge \flbeta)!}
+h^{p \wedge \beta}
+\lesssim
+h^{p \wedge \beta}.
+\end{align*}
+
+\end{theorem}
+
+\subsection{Hoeffding-type decomposition and degeneracy}
+\label{sec:kernel_degeneracy}
+
+Our next step is to consider the stochastic part
+$\hat f_W(w) - \E \big[ \hat f_W(w) \big]$
+of the classical bias--variance decomposition. This term is akin to a
+U-statistic and thus admits a Hoeffding-type decomposition, presented in
+Lemma~\ref{lem:kernel_hoeffding}, which is a key element in our analysis.
+
+\begin{lemma}[Hoeffding-type decomposition for $\hat f_W$]
+\label{lem:kernel_hoeffding}
+
+Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}
+hold. Define the linear, quadratic, and error terms
+%
+\begin{align*}
+L_n(w)
+&=
+\frac{2}{n} \sum_{i=1}^n l_i(w),
+&Q_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} q_{i j}(w), \\
+E_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w)
+\end{align*}
+%
+respectively, where
+%
+\begin{align*}
+l_i(w)
+&=
+\E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w)\right], \\
+q_{i j}(w)
+&=
+\E\left[k_h(W_{i j},w) \mid A_i, A_j\right]
+- \E\left[k_h(W_{i j},w) \mid A_i\right]
+- \E\left[k_h(W_{i j},w) \mid A_j\right]
++ \E\left[k_h(W_{i j},w)\right], \\
+e_{i j}(w)
+&=
+k_h(W_{i j},w) - \E\left[k_h(W_{i j},w) \mid A_i, A_j\right].
+\end{align*}
+%
+Then, recalling the bias term $B_n$ from Section~\ref{sec:kernel_bias},
+we have the Hoeffding-type decomposition
+%
+\begin{align}
+\label{eq:kernel_hoeffding}
+\hat f_W(w) - f_W(w) = L_n(w) + Q_n(w) + E_n(w) + B_n(w).
+\end{align}
+%
+The processes $L_n$, $Q_n$, and $E_n$ are mean-zero
+with $\E\big[L_n(w)\big] = \E\big[Q_n(w)\big] = \E\big[E_n(w)\big] = 0$
+for all $w \in \cW$. They are also orthogonal,
+satisfying $\E\big[ L_n(w) Q_n(w') \big] = \E\big[ L_n(w) E_n(w') \big]
+= \E\big[ Q_n(w) E_n(w') \big] = 0$ for all $w, w' \in \cW$.
+%
+\end{lemma}
+
+The process $L_n$ is the H{\'a}jek projection of a U-process,
+which can exhibit degeneracy if $\Var[L_n(w)] = 0$ at some
+or all points $w \in \cW$. To characterize the different possible
+degeneracy types in Lemma~\ref{lem:kernel_trichotomy},
+we first introduce the following lower and upper degeneracy constants:
+%
+\begin{align*}
+\Dl^2 := \inf_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right]
+\qquad \text{ and } \qquad
+\Du^2 := \sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right].
+\end{align*}
+%
+\begin{lemma}[Trichotomy of degeneracy]%
+\label{lem:kernel_trichotomy}%
+%
+Grant Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}.
+Then the type of degeneracy exhibited by $\hat f_W(w)$
+is precisely one of the following three possibilities.
+%
+\begin{enumerate}[label=(\roman*)]
+
+\item Total degeneracy:
+$\Du = \Dl = 0$. Then $L_n(w) = 0$ for all $w \in \cW$ almost surely.
+
+\item No degeneracy:
+$\Dl > 0$. Then $\inf_{w \in \cW} \Var[L_n(w)] \geq \frac{2 \Dl^2}{n}$
+for all large enough $n$.
+
+\item Partial degeneracy:
+$\Du > \Dl = 0$. There exists $w \in \cW$ with
+$\Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$;
+such a point is labeled \emph{degenerate} and satisfies
+$\Var[L_n(w)] \leq 64 C_\rk C_\rH C_\rd \frac{h}{n}$. 
+There is also a point $w' \in \cW$ with +$\Var\left[f_{W \mid A}(w' \mid A_i)\right] > 0$; +such a point is labeled \emph{non-degenerate} and satisfies +$\Var[L_n(w')] \geq +\frac{2}{n} \Var\left[f_{W \mid A}(w' \mid A_i)\right]$ +for all large enough $n$. + +\end{enumerate} + +\end{lemma} + +The following lemma describes the uniform stochastic order of the different +terms in the Hoeffding-type decomposition, explicitly accounting for potential +degeneracy. + +\begin{lemma}[Uniform concentration] +\label{lem:kernel_uniform_concentration} + +Suppose Assumptions \ref{ass:kernel_data} and +\ref{ass:kernel_bandwidth} hold. Then +% +\begin{align*} +\E\left[ \sup_{w \in \cW} |L_n(w)| \right] +&\lesssim \frac{\Du}{\sqrt n}, +&\E\left[ \sup_{w \in \cW} |Q_n(w)| \right] +&\lesssim \frac{1}{n}, +&\E\left[ \sup_{w \in \cW} |E_n(w)| \right] +&\lesssim \sqrt{\frac{\log n}{n^2h}}. +\end{align*} +\end{lemma} + +Lemma~\ref{lem:kernel_uniform_concentration} captures the potential total +degeneracy +of $L_n$ by illustrating how if $\Du=0$ then $L_n=0$ everywhere on $\cW$ almost +surely. The following lemma captures the potential partial degeneracy of $L_n$, +where $\Du > \Dl = 0$. For $w,w' \in \cW$, define the covariance function +% +\begin{align*} +\Sigma_n(w,w') = +\E\Big[ +\Big( +\hat f_W(w) +- \E\big[\hat f_W(w)\big] +\Big) +\Big( +\hat f_W(w') +- \E\big[\hat f_W(w')\big] +\Big) +\Big]. +\end{align*} +% +\begin{lemma}[Variance bounds] +\label{lem:kernel_variance_bounds} +Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} +hold. Then for sufficiently large $n$, +% +\begin{align*} +\frac{\Dl^2}{n} + \frac{1}{n^2h} +\inf_{w \in \cW} f_W(w) +&\lesssim +\inf_{w \in \cW} \Sigma_n(w,w) +\leq +\sup_{w \in \cW} \Sigma_n(w,w) +\lesssim +\frac{\Du^2}{n} + \frac{1}{n^2h}. +\end{align*} +% +\end{lemma} + +As a simple example of the different types of degeneracy, consider the family +of dyadic distributions $\P_{\pi}$ indexed by $\pi = (\pi_1, \pi_2, \pi_3)$ +with $\sum_{i=1}^3 \pi_i = 1$ and $\pi_i \geq 0$, generated by +$W_{i j} = A_i A_j + V_{i j}$, where $A_i$ equals $-1$ with probability +$\pi_1$, equals $0$ with probability $\pi_2$ and equals $+1$ with probability +$\pi_3$, and $V_{i j}$ is standard Gaussian. This model induces a latent +``community structure'' where community membership is determined by the value +of $A_i$ for each node $i$, and the interaction outcome $W_{i j}$ is a function +only of the communities which $i$ and $j$ belong to and some idiosyncratic +noise. Unlike the stochastic block model \citep{kolaczyk2009statistical}, our +setup assumes that community membership has no impact on edge existence, as we +work with fully connected networks; see Section~\ref{sec:kernel_trade_data} for +a +discussion of how to handle missing edges in practice. Also note that the +parameter of interest in this chapter is the Lebesgue density of a continuous +random variable $W_{i j}$ rather than the probability of network edge +existence, which is the focus of the graphon estimation literature +\citep{gao2021minimax}. + +In line with Assumption~\ref{ass:kernel_data}, $\bA_n$ and $\bV_n$ are i.i.d.\ +sequences independent of each other. Then +$f_{W \mid AA}(w \mid A_i, A_j) = \phi(w - A_i A_j)$,\, +$f_{W \mid A}(w \mid A_i) = \pi_1 \phi(w + A_i) + \pi_2 \phi(w) ++ \pi_3 \phi(w - A_i)$, and +$f_W(w) = (\pi_1^2 + \pi_3^2) \phi(w-1) + \pi_2 (2 - \pi_2) \phi(w) + 2 +\pi_1 \pi_3 \phi(w+1),$ +where $\phi$ denotes the probability density function of the standard normal +distribution. 
Note that $f_W(w)$ is strictly positive for all $w \in \R$. +Consider the parameter choices: +% +\begin{enumerate}[label=(\roman*)] + +\item $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$:\quad +$\P_\pi$ is degenerate at all $w \in \R$, + +\item $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$:\quad +$\P_\pi$ is degenerate only at $w=0$, + +\item $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$:\quad +$\P_\pi$ is non-degenerate for all $w \in \R$. + +\end{enumerate} +% +Figure~\ref{fig:kernel_distribution} demonstrates these phenomena, plotting the +density $f_W$ and the standard deviation of the conditional +density $f_{W|A}$ over $\cW = [-2,2]$ for each choice of the parameter $\pi$. + +The trichotomy of total/partial/no degeneracy is useful for understanding the +distributional properties of the dyadic kernel density estimator +$\hat{f}_W(w)$. Crucially, our need for uniformity in $w$ complicates the +simpler degeneracy/no degeneracy dichotomy observed previously in the +literature \citep{graham2024kernel}. From a pointwise-in-$w$ +perspective, partial degeneracy causes no issues, while it is a fundamental +problem when conducting inference uniformly over $w \in \cW$. We develop +methods that are valid regardless of the presence of partial or total +degeneracy. + +\begin{figure}[t] +\centering +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/distribution_plot_total.pdf} +\caption{Total degeneracy, \\ +$\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/distribution_plot_partial.pdf} +\caption{Partial degeneracy, \\ +$\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/distribution_plot_none.pdf} +\caption{No degeneracy, \\ +$\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} +\end{subfigure} +% +\caption[The family of distributions $\P_\pi$]{ +Density $f_W$ and standard deviation +of $f_{W|A}$ for the family of distributions $\P_\pi$.} +% +\label{fig:kernel_distribution} +\end{figure} + +\section{Point estimation results} +\label{sec:kernel_point_estimation} + +Using the bias bound from Theorem~\ref{thm:kernel_bias} and +the concentration results from Lemma~\ref{lem:kernel_uniform_concentration}, +the next theorem establishes an upper bound on the uniform convergence rate of +$\hat f_W$. +% +\begin{theorem}[Uniform convergence rate]% +\label{thm:kernel_uniform_consistency}% +Suppose that Assumptions \ref{ass:kernel_data} and +\ref{ass:kernel_bandwidth} hold. Then +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big|\hat{f}_W(w) - f_W(w)\big| +\right] +\lesssim +h^{p\wedge\beta} + \frac{\Du}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}. +\end{align*} +\end{theorem} +% +The implicit constant in Theorem~\ref{thm:kernel_uniform_consistency} depends +only on +$\cW$, $\beta$, $C_\rH$, and the choice of kernel. We interpret this result in +light of the degeneracy trichotomy from Lemma~\ref{lem:kernel_trichotomy}. +These results generalize \citet*[Theorem~1]{chiang2020empirical} +by allowing for compactly supported data and more general kernels +$k_h(\cdot,w)$, enabling boundary-adaptive estimation. + +% +\begin{enumerate}[label=(\roman*)] +\item Partial or no degeneracy: $\Du > 0$. 
+Any bandwidths satisfying +$n^{-1} \log n \lesssim h \lesssim n^{-\frac{1}{2(p\wedge\beta)}}$ yield +$\E\big[\sup_{w \in \cW}\big|\hat f_W(w) +- f_W(w)\big| \big] \lesssim \frac{1}{\sqrt n}$, the ``parametric'' +bandwidth-independent rate noted by \citet{graham2024kernel}. + +\item Total degeneracy: $\Du = 0$. +Minimizing the bound in Theorem~\ref{thm:kernel_uniform_consistency} with +$h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2(p\wedge\beta)+1}}$ +yields $\E\big[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \big] +\lesssim +\big(\frac{\log n}{n^2} \big)^{\frac{p\wedge\beta}{2(p\wedge\beta)+1}}$. +\end{enumerate} + +\subsection{Minimax optimality} + +We establish the minimax rate under the supremum norm for density estimation +with dyadic data. This implies minimax optimality of the kernel density +estimator $\hat f_W$, regardless of the degeneracy type of the dyadic +distribution. + +\begin{theorem}[Uniform minimax optimality] +\label{thm:kernel_minimax} + +Fix $\beta \geq 1$ and $C_\rH > 0$, and take $\cW$ a compact interval with +positive Lebesgue measure. Define $\cP = \cP(\cW, \beta, C_\rH)$ as the class +of dyadic distributions satisfying Assumption~\ref{ass:kernel_data}. Define +$\cP_\rd$ as the subclass of $\cP$ containing only those distributions which +are totally degenerate on $\cW$ in the sense that +$\sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$. Then +% +\begin{align*} +\inf_{\tilde f_W} +\sup_{\P \in \cP} +\E_\P\left[ +\sup_{w \in \cW} +\big| \tilde f_W(w) - f_W(w) \big| +\right] +&\asymp +\frac{1}{\sqrt n}, \\ +\inf_{\tilde f_W} +\sup_{\P \in \cP_\rd} +\E_\P\left[ +\sup_{w \in \cW} +\big| \tilde f_W(w) - f_W(w) \big| +\right] +&\asymp +\left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}}, +\end{align*} +% +where $\tilde f_W$ is any estimator depending only on the data +$\bW_n = (W_{i j}: 1 \leq i < j \leq n)$ distributed according to the dyadic +law $\P$. The constants in $\asymp$ depend only on +$\cW$, $\beta$, and $C_\rH$. + +\end{theorem} + +Theorem~\ref{thm:kernel_minimax} shows that the uniform convergence rate of +$n^{-1/2}$ obtained in Theorem~\ref{thm:kernel_uniform_consistency} +(coming from the $L_n$ term) is minimax-optimal in general. +When attention is restricted to totally degenerate dyadic distributions, +$\hat f_W$ also achieves the minimax rate of uniform convergence +(assuming a kernel of sufficiently high order $p \geq \beta$), +which is on the order of +$\left(\frac{\log n}{n^2}\right)^{\frac{\beta}{2\beta+1}}$ and +is determined by the bias $B_n$ and the leading variance term $E_n$ in +\eqref{eq:kernel_hoeffding}. + +Combining Theorems +\ref{thm:kernel_uniform_consistency}~and~\ref{thm:kernel_minimax}, +we conclude that $\hat{f}_W(w)$ achieves the minimax-optimal rate for uniformly +estimating $f_W(w)$ if $h \asymp \left( \frac{\log n}{n^2} +\right)^{\frac{1}{2\beta+1}}$ and a kernel of sufficiently high order +($p \geq \beta$) is used, whether or not there are any degenerate points in the +underlying data generating process. This result appears to be new to the +literature on nonparametric estimation with dyadic data. See +\citet{gao2021minimax} for a contemporaneous review. + +\section{Distributional results} +\label{sec:kernel_inference} + +We investigate the distributional properties of the +standardized $t$-statistic process +% +\begin{align*} +T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\Sigma_n(w,w)}}, +\end{align*} +% +which is not necessarily asymptotically tight. 
Therefore, to approximate the +distribution of the entire $t$-statistic process, as well as specific +functionals thereof, we rely on a novel strong approximation approach outlined +in this section. Our results can be used to perform valid uniform inference +irrespective of the degeneracy type. + +This section is largely concerned with distributional properties and thus +frequently requires copies of stochastic processes. For succinctness of +notation, we will not differentiate between a process and its copy, but details +are available in Section~\ref{sec:kernel_app_technical}. + +\subsection{Strong approximation} + +By the Hoeffding-type decomposition \eqref{eq:kernel_hoeffding} and +Lemma~\ref{lem:kernel_uniform_concentration}, it suffices to consider the +distributional properties of the stochastic process $L_n + E_n$. +Our approach combines the K{\'o}mlos--Major--Tusn{\'a}dy (KMT) approximation +\citep{komlos1975approximation} to obtain a strong approximation of $L_n$ with +a Yurinskii approximation \citep{yurinskii1978error} to obtain a +\emph{conditional} (on $\bA_n$) strong approximation of $E_n$. The latter is +necessary because $E_n$ is akin to a local empirical process of i.n.i.d.\ +random variables, conditional on $\bA_n$, and therefore the KMT approximation +is not applicable. These approximations are then combined to give a final +(unconditional) strong approximation for $L_n+E_n$, and thus for the +$t$-statistic process $T_n$. + +The following lemma is an application of our generic KMT approximation result +for empirical processes, given in Section~\ref{sec:kernel_app_technical}, which +builds on earlier work by \citet{gine2004kernel} and \citet{gine2010confidence} +and may be of independent interest. + +\begin{lemma}[Strong approximation of $L_n$] +\label{lem:kernel_strong_approx_Ln} +% +Suppose that Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} +hold. For each $n$ there exists a mean-zero Gaussian process $Z^L_n$ indexed +on $\cW$ satisfying +$\E\big[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w) \big| \big] +\lesssim \frac{\Du \log n}{\sqrt{n}}$, where +$\E[Z_n^L(w)Z_n^L(w')] = n\E[L_n(w)L_n(w')]$ for all $w, w' \in \cW$. The +process $Z_n^L$ is a function only of $\bA_n$ and some random noise +independent of $(\bA_n, \bV_n)$. +\end{lemma} + +% donsker case +The strong approximation result in Lemma~\ref{lem:kernel_strong_approx_Ln} +would be +sufficient to develop valid and even optimal uniform inference procedures +whenever both $\Dl > 0$ (no degeneracy in $L_n$) and $n h \gg \log n$ +($L_n$ is leading). In this special case, the recent Donsker-type results of +\citet{davezies2021exchangeable} can be applied to analyze the limiting +distribution of the stochastic process $\hat{f}_W$. Alternatively, again only +when $L_n$ is non-degenerate and leading, standard empirical process methods +could also be used. However, even in the special case when $\hat{f}_W(w)$ is +asymptotically Donsker, our result in Lemma~\ref{lem:kernel_strong_approx_Ln} +improves +upon the literature by providing a rate-optimal strong approximation for +$\hat{f}_W$ as opposed to only a weak convergence result. See Theorem +\ref{thm:kernel_infeasible_ucb} and the subsequent discussion below. + +% however often non-donsker +More importantly, as illustrated above, it is common in the literature to find +dyadic distributions which exhibit partial or total degeneracy, making the +process $\hat{f}_W$ non-Donsker. 
Thus approximating only $L_n$ is in general +insufficient for valid uniform inference, and it is necessary to capture the +distributional properties of $E_n$ as well. +% we do better +The following lemma is an application of our strong approximation result for +empirical processes based on the Yurinskii approximation, which builds on a +refinement by \citet{belloni2019conditional}. + +\begin{lemma}[Conditional strong approximation of $E_n$] +\label{lem:kernel_conditional_strong_approx_En} +% +Suppose Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold +and take any $R_n \to \infty$. For each $n$ there exists $\tilde Z^E_n$ +a mean-zero Gaussian process conditional on $\bA_n$ satisfying +$\sup_{w \in \cW} +\big| \sqrt{n^2h} E_n(w) - \tilde Z_n^E(w) \big| +\lesssim_\P \frac{(\log n)^{3/8} R_n}{n^{1/4}h^{3/8}}$, +where $\E[\tilde Z_n^E(w)\tilde Z_n^E(w')\bigm\vert \bA_n] +=n^2h\E[E_n(w)E_n(w')\bigm\vert \bA_n]$ +for all $w, w' \in \cW$. +% +\end{lemma} + +The process $\tilde Z_n^E$ is a Gaussian process conditional on $\bA_n$ but is +not in general a Gaussian process unconditionally. The following lemma +constructs an unconditional Gaussian process $Z_n^E$ that approximates +$\tilde Z_n^E$. + +\begin{lemma}[Unconditional strong approximation of $E_n$] +\label{lem:kernel_unconditional_strong_approx_En} + +Suppose that Assumptions \ref{ass:kernel_data} and +\ref{ass:kernel_bandwidth} hold. For each $n$ there exists +a mean-zero Gaussian process $Z^E_n$ satisfying +$\E\big[ \sup_{w \in \cW} \big|\tilde Z_n^E(w) - Z_n^E(w)\big| \big] +\lesssim \frac{(\log n)^{2/3}}{n^{1/6}}$, +where $Z_n^E$ is independent of $\bA_n$ and +$\E[Z_n^E(w)Z_n^E(w')]=\E[\tilde Z_n^E(w)\tilde Z_n^E(w')] += n^2h \, \E[E_n(w)E_n(w')]$ for all $w, w' \in \cW$. +% +\end{lemma} + +Combining Lemmas \ref{lem:kernel_conditional_strong_approx_En} +and~\ref{lem:kernel_unconditional_strong_approx_En}, we obtain +an unconditional strong +approximation for $E_n$. The resulting rate of approximation may not be +optimal, due to the Yurinskii coupling, but to the best of our knowledge it is +the first in the literature for the process $E_n$, and hence for $\hat{f}_W$ +and its associated $t$-process in the context of dyadic data. The approximation +rate is sufficiently fast to allow for optimal bandwidth choices; see Section +\ref{sec:kernel_implementation} for more details. Strong approximation results +for +local empirical processes (e.g.\ \citealp{gine2010confidence}) are not +applicable here because the summands in the non-negligible $E_n$ are not +(conditionally) i.i.d. Likewise, neither standard empirical process and +U-process theory \citep{van1996weak,gine2021mathematical} nor the recent +results in \citet{davezies2021exchangeable} are applicable to the non-Donsker +process $E_n$. + +The previous lemmas showed that $L_n$ is $\sqrt{n}$-consistent while $E_n$ is +$\sqrt{n^2h}$-consistent (pointwise in $w$), showcasing the importance of +careful standardization (cf.\ Studentization in +Section~\ref{sec:kernel_implementation}) for the purpose of rate adaptivity to +the +unknown degeneracy type. In other words, a challenge in conducting uniform +inference is that the finite-dimensional distributions of the stochastic +process $L_n+E_n$, and hence those of $\hat{f}_W$ and its associated +$t$-process $T_n$, may converge at different rates at different points +$w\in\cW$. The following theorem provides an (infeasible) inference procedure +which is fully adaptive to such potential unknown degeneracy. 
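+
+Specifically, combining Lemmas~\ref{lem:kernel_conditional_strong_approx_En}
+and~\ref{lem:kernel_unconditional_strong_approx_En} via the triangle
+inequality gives, on a suitably enlarged probability space,
+%
+\begin{align*}
+\sup_{w \in \cW}
+\big| \sqrt{n^2h} E_n(w) - Z_n^E(w) \big|
+\lesssim_\P
+\frac{(\log n)^{3/8} R_n}{n^{1/4}h^{3/8}}
++ \frac{(\log n)^{2/3}}{n^{1/6}},
+\end{align*}
+%
+and it is in this combined form that the approximation of $E_n$ enters the
+bound below.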
+ +\begin{theorem}[Strong approximation of $T_n$] +\label{thm:kernel_strong_approx_Tn} + +Suppose that Assumptions~\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} +hold and $f_W(w) > 0$ on $\cW$, and take any $R_n \to \infty$. Then for each +$n$ there exists a centered Gaussian process $Z_n^{T}$ such that +% +\begin{align*} +&\sup_{w \in \cW} \left| T_n(w) - Z_n^{T}(w) \right| +\lesssim_\P \! +\frac{ +n^{-1} \! \log n ++ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-7/6} h^{-1/2} (\log n)^{2/3} ++ h^{p\wedge\beta}} +{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}, +\end{align*} +% +where $\E[Z_n^T(w)Z_n^T(w')] = \E[T_n(w)T_n(w')]$ for all $w,w' \in \cW$. +% +\end{theorem} + +The first term in the numerator corresponds to the strong approximation for +$L_n$ in Lemma~\ref{lem:kernel_strong_approx_Ln} and the error introduced by +$Q_n$. +The second and third terms correspond to the conditional and unconditional +strong approximation errors for $E_n$ in Lemmas +\ref{lem:kernel_conditional_strong_approx_En} and +\ref{lem:kernel_unconditional_strong_approx_En}. +The fourth term is from +the smoothing bias result in Theorem~\ref{thm:kernel_bias}. The denominator is +the lower bound on the standard deviation $\Sigma_n(w,w)^{1/2}$ formulated in +Lemma~\ref{lem:kernel_variance_bounds}. + +In the absence of degenerate points ($\Dl > 0$) and if $n h^{7/2}\gtrsim 1$, +Theorem~\ref{thm:kernel_strong_approx_Tn} offers a strong approximation of the +$t$-process at the rate $(\log n)/\sqrt{n}+\sqrt{n}h^{p\wedge\beta}$, which +matches the celebrated KMT approximation rate for i.i.d.\ data plus the +smoothing bias. Therefore, our novel $t$-process strong approximation can +achieve the optimal KMT rate for non-degenerate dyadic distributions provided +that $p\wedge\beta \geq 3.5$. This is achievable if a fourth-order +(boundary-adaptive) kernel is used and $f_W$ is sufficiently smooth. + +In the presence of partial or total degeneracy ($\Dl =0$), +Theorem~\ref{thm:kernel_strong_approx_Tn} provides a strong approximation for +the +$t$-process at the rate +$\sqrt{h}\log n + n^{-1/4}h^{-3/8}(\log n)^{3/8} R_n + n^{-1/6}(\log n)^{2/3} ++ n h^{1/2+p\wedge\beta}$. If, for example, $n h^{p\wedge\beta}\lesssim 1$, +then our result can achieve a strong approximation rate of $n^{-1/7}$ up to +$\log n $ terms. Theorem~\ref{thm:kernel_strong_approx_Tn} appears to be the +first in the dyadic literature which is also robust to the presence of +degenerate points in the underlying dyadic distribution. + +\subsection{Application: confidence bands} + +Theorem~\ref{thm:kernel_infeasible_ucb} constructs standardized confidence +bands for +$f_W$ which are infeasible as they depend on the unknown population variance +$\Sigma_n$. In Section~\ref{sec:kernel_implementation} we will make this +inference +procedure feasible by proposing a valid estimator of the covariance function +$\Sigma_n$ for Studentization, as well as developing bandwidth selection and +robust bias correction methods. Before presenting our result on valid +infeasible uniform confidence bands, we first impose in +Assumption~\ref{ass:kernel_rates} some extra restrictions on the bandwidth +sequence, +which depend on the degeneracy type of the dyadic distribution, to ensure the +coverage rate converges. 
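+
+The two regimes below reflect the variance bounds in
+Lemma~\ref{lem:kernel_variance_bounds}: with no degeneracy and the bandwidths
+permitted below, the standard deviation $\Sigma_n(w,w)^{1/2}$ is of order
+$n^{-1/2}$ uniformly over $\cW$, whereas at degenerate points it can be as
+small as order $(n^2h)^{-1/2}$, so the smoothing bias and the coupling errors
+appearing in Theorem~\ref{thm:kernel_strong_approx_Tn} must then be negligible
+relative to this smaller scale. For example, at the upper bandwidth limit
+$h \asymp (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$ allowed under
+partial or total degeneracy, the standardized bias satisfies
+%
+\begin{align*}
+h^{p \wedge \beta} \sqrt{n^2h}
+\asymp
+n \left( n^2 \log n \right)^{-\frac{p \wedge \beta + 1/2}{2(p \wedge \beta)+1}}
+= \frac{1}{\sqrt{\log n}}
+\to 0.
+\end{align*}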
+ +\begin{assumption}[Rate restriction for uniform confidence bands] +\label{ass:kernel_rates} +Assume that one of the following holds: +% +\begin{enumerate}[label=(\roman*)] + +\item +\label{it:kernel_rate_non} +No degeneracy ($\Dl > 0$): +$n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$, + +\item +\label{it:kernel_rate_degen} +Partial or total degeneracy ($\Dl = 0$): +$n^{-2/3} (\log n)^{7/3} \ll h +\ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. +\end{enumerate} +\end{assumption} + +We now construct the infeasible uniform confidence bands. +For $\alpha \in (0,1)$, let $q_{1-\alpha}$ be the quantile satisfying +$ \P\left(\sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) += 1 - \alpha$. +The following result employs the anti-concentration idea due to +\citet{chernozhukov2014anti} to deduce valid standardized confidence bands, +where we approximate the quantile of the unknown finite sample distribution of +$\sup_{w\in\cW} |T_n(w)|$ by the quantile $q_{1-\alpha}$ of +$\sup_{w\in\cW}|Z_n^T(w)|$. This approach offers a better rate of convergence +than relying on extreme value theory for the distributional approximation, +hence improving the finite sample performance of the proposed confidence bands. + +\begin{theorem}[Infeasible uniform confidence bands] +\label{thm:kernel_infeasible_ucb} + +Suppose that Assumptions~\ref{ass:kernel_data},~\ref{ass:kernel_bandwidth}, +and~\ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then +% +\begin{align*} +\P\left( +f_W(w) \in +\left[ \hat f_W(w) \pm q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \, \right] +\, \textup{for all } w \in \cW +\right) +\to 1 - \alpha. +\end{align*} +% +\end{theorem} + +By Theorem~\ref{thm:kernel_uniform_consistency}, the asymptotically optimal +choice of +bandwidth for uniform convergence is +$h \asymp ((\log n)/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. +As discussed in the next section, the approximate +IMSE-optimal bandwidth is $h \asymp (1/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. +Both bandwidth choices satisfy Assumption~\ref{ass:kernel_rates} only in the +case of +no degeneracy. The degenerate cases in +Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_degen}, which require +$p \wedge \beta > 1$, exhibit behavior more similar to that of standard +nonparametric kernel-based estimation and so the aforementioned optimal +bandwidth choices will lead to a non-negligible smoothing bias in the +distributional approximation for $T_n$. Different approaches are available in +the literature to address this issue, including undersmoothing or ignoring the +bias \citep{hall2001bootstrapping}, bias correction \citep{hall1992effect}, +robust bias correction \citep{calonico2018effect, calonico2022coverage}, and +Lepskii's method +\citep{lepskii1992asymptotically,birge2001alternative}, among others. In the +next section we develop a feasible uniform inference procedure, based on robust +bias correction methods, which amounts to first selecting an optimal bandwidth +for the point estimator $\hat{f}_W$ using a $p$th-order kernel, and then +correcting the bias of the point estimator while also adjusting the +standardization (Studentization) when forming the $t$-statistic $T_n$. 
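+
+For instance, with $p = 2$, $p' = 4$, and $\beta \geq 4$, the AIMSE-optimal
+bandwidth discussed in the next section satisfies $h \asymp n^{-2/5}$, and
+since
+%
+\begin{align*}
+n^{-2/3} (\log n)^{7/3} \ll n^{-2/5} \ll (n^2 \log n)^{-1/9}
+\qquad \text{and} \qquad
+n^{-6/7} \log n \ll n^{-2/5} \ll (n \log n)^{-1/8},
+\end{align*}
+%
+this choice satisfies both parts of Assumption~\ref{ass:kernel_rates} once the
+higher-order kernel is used, so the resulting confidence bands are valid
+whatever the (unknown) degeneracy type.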
+ +Importantly, regardless of the specific implementation details, +Theorem~\ref{thm:kernel_infeasible_ucb} shows that any bandwidth sequence $h$ +satisfying both \ref{it:kernel_rate_non} and \ref{it:kernel_rate_degen} +in Assumption~\ref{ass:kernel_rates} leads to valid uniform inference which is +robust +and adaptive to the (unknown) degeneracy type. + +\section{Implementation} +\label{sec:kernel_implementation} + +We address outstanding implementation details to make our main uniform +inference results feasible. In Section~\ref{sec:kernel_covariance_estimation} we +propose a covariance estimator along with a modified version which is +guaranteed to be positive semi-definite. This allows for the construction of +fully feasible confidence bands in +Section~\ref{sec:kernel_feasible_confidence_bands}. +In Section~\ref{sec:kernel_bandwidth_selection} we discuss bandwidth selection +and +formalize our procedure for robust bias correction inference. + +\subsection{Covariance function estimation} +\label{sec:kernel_covariance_estimation} + +Define the following plug-in covariance function +estimator of $\Sigma_n$. For $w, w' \in \cW$, +let $S_i(w) = \frac{1}{n-1} \big( \sum_{j = 1}^{i-1} k_h(W_{j i}, w) ++ \sum_{j = i+1}^n k_h(W_{i j}, w) \big)$ +estimate $\E[k_h(W_{i j},w) \mid A_i]$ and take +% +\begin{align*} +\hat \Sigma_n(w,w') +&= \frac{4}{n^2} \sum_{i=1}^n S_i(w) S_i(w') +- \frac{4}{n^2(n-1)^2} \sum_{i 0$ on $\cW$. Then +% +\begin{align*} +\sup_{w,w' \in \cW} +\left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| +&\lesssim_\P \frac{\sqrt{\log n}}{n}. +\end{align*} +% +The optimization problem \eqref{eq:kernel_sdp} is a semi-definite +program +\citep[SDP,][]{laurent2005semidefinite} and has an approximately optimal +solution $\hat\Sigma_n^+$ satisfying +% +\begin{align*} +\sup_{w,w' \in \cW} \left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| +&\lesssim_\P \frac{\sqrt{\log n}}{n}. +\end{align*} +% +\end{lemma} + +In practice we take $w, w' \in \cW_d$ where $\cW_d$ is a finite subset of +$\cW$, typically taken to be an equally-spaced grid. This yields +finite-dimensional covariance matrices, for which \eqref{eq:kernel_sdp} can be +solved +in polynomial time in $|\cW_d|$ using a general-purpose SDP solver +\citep[e.g.\ by interior point methods,][]{laurent2005semidefinite}. +The number of points in $\cW_d$ should be taken as large as is computationally +practical in order to generate confidence bands rather than merely simultaneous +confidence intervals. It is worth noting that the complexity of solving +\eqref{eq:kernel_sdp} does not depend on the number of vertices $n$, and so +does not +influence the ability of our methodology to handle large and possibly sparse +networks. + +The bias-corrected variance estimator in +\citet[Section~3.2]{matsushita2021jackknife} takes a similar form to our +estimator +$\hat\Sigma_n$ but in the parametric setting, and is therefore also not +guaranteed to be positive semi-definite in finite samples. Our approach +addresses this issue, ensuring a positive semi-definite estimator +$\hat\Sigma_n^+$ is always available. + +\subsection{Feasible confidence bands} +\label{sec:kernel_feasible_confidence_bands} + +Given a choice of the kernel order $p$ and a bandwidth $h$, we construct a +valid confidence band that is implementable in practice. 
Define the Studentized +$t$-statistic process +% +\begin{align*} +\hat T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\hat \Sigma_n^+(w,w)}}. +\end{align*} +% +Let $\hat Z_n^T(w)$ be a process which, conditional on the data $\bW_n$, +is mean-zero and Gaussian, whose conditional covariance structure is +$\E\big[ \hat Z_n^T(w) \hat Z_n^T(w') \bigm\vert \bW_n \big] += \frac{\hat \Sigma_n^+(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$. +For $\alpha \in (0,1)$, let $\hat q_{1-\alpha}$ be the +conditional quantile satisfying +$\P\big(\sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq \hat q_{1-\alpha} +\bigm\vert \bW_n \big) = 1 - \alpha$, +which is shown to be well defined in Section~\ref{sec:kernel_app_proofs}. + +\begin{theorem}[Feasible uniform confidence bands] +\label{thm:kernel_ucb} + +Suppose that Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, +and \ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then +% +\begin{align*} +\P\left( +f_W(w) \in +\left[ \hat f_W(w) \pm \hat q_{1-\alpha} +\sqrt{\hat\Sigma_n^+(w,w)} \,\right] +\,\textup{for all } w \in \cW +\right) \to 1 - \alpha. +\end{align*} +% +\end{theorem} + +Recently, \citet{chiang2022inference} derived high-dimensional central limit +theorems over rectangles for exchangeable arrays and applied them to construct +simultaneous confidence intervals for a sequence of design points. Their +inference procedure relies on the multiplier bootstrap, and their conditions +for valid inference depend on the number of design points considered. In +contrast, Theorem~\ref{thm:kernel_ucb} constructs a feasible uniform confidence +band over the entire domain of inference $\cW$ based on our strong +approximation results for the whole $t$-statistic process and the covariance +estimator $\hat\Sigma_n^+$. The required rate condition specified in +Assumption~\ref{ass:kernel_rates} does not depend on the number of design +points. +Furthermore, our proposed inference methods are robust to potential unknown +degenerate points in the underlying dyadic data generating process. + +In practice, suprema over $\cW$ can be replaced by maxima over sufficiently +many design points in $\cW$. The conditional quantile $\hat q_{1-\alpha}$ can +be estimated by Monte Carlo simulation, resampling from the Gaussian process +defined by the law of $\hat Z_n^T \mid \bW_n$. + +The bandwidth restrictions in Theorem~\ref{thm:kernel_ucb} are the same as +those for the infeasible version given in +Theorem~\ref{thm:kernel_infeasible_ucb}, +namely those imposed in Assumption \ref{ass:kernel_rates}. This follows from +the rates +of convergence obtained in Lemma~\ref{lem:kernel_sdp}, coupled with some careful +technical work given in Section~\ref{sec:kernel_app_proofs} to handle the +potential +presence of degenerate points in $\Sigma_n$. + +\subsection{Bandwidth selection and robust bias-corrected inference} +\label{sec:kernel_bandwidth_selection} + +We give practical suggestions for selecting the bandwidth parameter $h$. +Let $\nu(w)$ be a non-negative real-valued function on $\cW$ and suppose we use +a kernel of order $p < \beta$ of the form $k_h(s,w) = K\big((s-w) / h\big)/h$. +The $\nu$-weighted asymptotic IMSE (AIMSE) is minimized by +% +\begin{align*} +h^*_{\AIMSE} +&= +\left( +\frac{p!(p-1)! 
+\Big(\int_\cW f_W(w) \nu(w) \diff{w}\Big) +\Big(\int_\R K(w)^2 \diff{w}\Big)} +{2 \Big( +\int_{\cW} +f_W^{(p)}(w)^2 +\nu(w) +\diff{w} +\Big) +\Big( +\int_\R +w^p K(w) +\diff{w} +\Big)^2 +} +\right)^{\frac{1}{2p+1}} +\left( \frac{n(n-1)}{2} \right)^{-\frac{1}{2p+1}}. +\end{align*} +% +This is akin to the AIMSE-optimal bandwidth choice for traditional monadic +kernel density estimation with a sample size of $\frac{1}{2}n(n-1)$. The choice +$h^*_{\AIMSE}$ is slightly undersmoothed (up to a polynomial $\log n$ factor) +relative to the uniform minimax-optimal bandwidth choice discussed in +Section~\ref{sec:kernel_point_estimation}, but it is easier to implement in +practice. + +To implement the AIMSE-optimal bandwidth choice, we propose a simple +rule-of-thumb (ROT) approach based on Silverman's rule. +Suppose $p\wedge\beta=2$ and let $\hat\sigma^2$ and $\hat I$ +be the sample variance and sample interquartile range respectively +of the data $\bW_n$. Then +$\hat{h}_{\ROT} = C(K) \big( \hat\sigma \wedge +\frac{\hat I}{1.349} \big) \big(\frac{n(n-1)}{2} \big)^{-1/5}$, +where we have $C(K)=2.576$ for the triangular kernel $K(w) = (1 - |w|) \vee 0$, +and $C(K)=2.435$ for the Epanechnikov kernel +$K(w) = \frac{3}{4}(1 - w^2) \vee 0$. + +The AIMSE-optimal bandwidth selector $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ +and any of its feasible estimators only satisfy +Assumption~\ref{ass:kernel_rates} in +the case of no degeneracy ($\Dl>0$). Under partial or total degeneracy, such +bandwidths are not valid due to the usual leading smoothing (or +misspecification) bias of the distributional approximation. To circumvent this +problem and construct feasible uniform confidence bands for $f_W$, we employ +the following robust bias correction approach. + +\begin{algorithm}[b!] +\caption{Feasible uniform confidence bands} +\label{alg:kernel_method} +\setstretch{1.5} + +Choose a kernel $k_h$ of order $p \geq 2$ satisfying +Assumption~\ref{ass:kernel_bandwidth}. \\ + +Select a bandwidth $h \approx h^*_{\AIMSE}$ for $k_h$ +as in Section~\ref{sec:kernel_bandwidth_selection}, +perhaps using $h = \hat{h}_{\ROT}$. \\ + +Choose another kernel $k_h'$ of order $p'>p$ satisfying +Assumption~\ref{ass:kernel_bandwidth}. + +For $d \geq 1$, choose a set of $d$ distinct evaluation points $\cW_d$. \\ + +For each $w \in \cW_d$, construct the density estimate $\hat f_W(w)$ +using $k'_{h}$ as in Section~\ref{sec:kernel_introduction}. \\ + +For $w, w' \in \cW_d$, estimate the covariance $\hat \Sigma_n(w,w')$ +using $k'_{h}$ as in Section~\ref{sec:kernel_covariance_estimation}. \\ + +Construct positive semi-definite +covariance estimate $\hat \Sigma_n^+$ +as in Section~\ref{sec:kernel_covariance_estimation}. \\ + +For $B \geq 1$, let $(\hat Z_{n,r}^T: 1\leq r\leq B)$ be i.i.d.\ +from $\hat{Z}_n^T$ as in Section~\ref{sec:kernel_feasible_confidence_bands}. +\\ + +For $\alpha \in (0,1)$, set +$\hat q_{1-\alpha} = \inf_{q \in \R} +\{ q : \# \{r: \max_{w\in\cW_d}|\hat Z_{n,r}^T(w)| \leq q \} +\geq B(1-\alpha) \}$. \\ + +Construct $ \big[\hat f_W(w) \pm +\hat q_{1-\alpha} \hat\Sigma_n^+(w,w)^{1/2} \big]$ for each $w \in \cW_d$. +% +\end{algorithm} + +Firstly, estimate the bandwidth $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ using a +kernel of order $p$, which leads to an AIMSE-optimal point estimator +$\hat{f}_W$ in an $L^2(\nu)$ sense. Then use this bandwidth and a kernel of +order $p' > p$ to construct the statistic $\hat T_n$ and the confidence band as +detailed in Section~\ref{sec:kernel_feasible_confidence_bands}. 
Importantly, +both +$\hat{f}_W$ and $\hat{\Sigma}^+_n$ are recomputed with the new higher-order +kernel. The change in centering is equivalent to a bias correction of the +original AIMSE-optimal point estimator, while the change in scale captures the +additional variability introduced by the bias correction itself. As shown +formally in \citet{calonico2018effect, calonico2022coverage} for the case of +kernel-based density +estimation with i.i.d.\ data, this approach leads to higher-order refinements +in the distributional approximation whenever additional smoothness is available +($p'\leq\beta$). In the present dyadic setting, this procedure is valid so long +as $n^{-2/3} (\log n)^{7/3} \ll n^{-\frac{2}{2p+1}} +\ll (n^2 \log n)^{-\frac{1}{2p' + 1}}$, +which is equivalent to $2 \leq p < p'$. +For concreteness, we recommend taking $p = 2$ and $p' = 4$, +and using the rule-of-thumb bandwidth choice $\hat{h}_{\ROT}$ defined above. +In particular, this approach automatically delivers a KMT-optimal +strong approximation whenever there are no degeneracies in the +underlying dyadic data generating process. +Our feasible robust bias correction method based on AIMSE-optimal dyadic +kernel density estimation for constructing uniform confidence bands +for $f_W$ is summarized in Algorithm~\ref{alg:kernel_method}. + +\section{Simulations} +\label{sec:kernel_simulations} + +We investigate the empirical finite-sample performance of the kernel density +estimator with dyadic data using simulations. The family of dyadic +distributions defined in Section~\ref{sec:kernel_degeneracy}, with its three +parameterizations, is used to generate data sets with different degeneracy +types. + +We use two different boundary bias-corrected Epanechnikov kernels of orders +$p=2$ and $p=4$ respectively, on the inference domain $\cW = [-2,2]$. We select +an optimal bandwidth for $p=2$ as recommended in +Section~\ref{sec:kernel_bandwidth_selection}, using the rule-of-thumb with +$C(K) = 2.435$. The semi-definite program in +Section~\ref{sec:kernel_covariance_estimation} is solved with the MOSEK +interior point +optimizer \citep{mosek}, ensuring positive semi-definite covariance estimates. +Gaussian vectors are resampled $B = 10\,000$ times. + +\begin{figure}[b!] +\centering +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/outcome_plot_total.pdf} +\caption{Total degeneracy, \\ +$\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/outcome_plot_partial.pdf} +\caption{Partial degeneracy, \\ +$\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} +\end{subfigure} +% +\begin{subfigure}{0.32\textwidth} +\centering +\includegraphics[scale=0.64]{graphics/outcome_plot_none.pdf} +\caption{No degeneracy, \\ +$\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} +\end{subfigure} +% +\caption[Typical outcomes for different values of the parameter $\pi$] +{Typical outcomes for three different values of the parameter $\pi$.} +% +\label{fig:kernel_results} +% +\end{figure} + +In Figure~\ref{fig:kernel_results} we plot a typical outcome for each of the +three +degeneracy types (total, partial, none), using the Epanechnikov kernel of order +$p=2$, with sample size $n=100$ (so $N=4950$ pairs of nodes) and with $d=100$ +equally-spaced evaluation points. 
Each plot contains the true density function +$f_W$, the dyadic kernel density estimate $\hat f_W$ and two different +approximate $95\%$ confidence bands for $f_W$. The first is the uniform +confidence band (UCB) constructed using one of our main results, +Theorem~\ref{thm:kernel_ucb}. The second is a sequence of pointwise confidence +intervals (PCI) constructed by finding a confidence interval for each +evaluation point separately. We show only $10$ pointwise confidence intervals +for clarity. In general, the PCIs are too narrow as they fail to provide +simultaneous (uniform) coverage over the evaluation points. Note that under +partial degeneracy the confidence band narrows near the degenerate point +$w = 0$. + +\begin{table}[b!] +\centering +\begin{tabular}{|c|c|c|c|c|cc|cc|} +\hline +\multirow{2}{*}{$ \pi $} +& \multirow{2}{*}{Degeneracy type} +& \multirow{2}{*}{$ \hat h_{\ROT} $} +& \multirow{2}{*}{$ p $} +& \multirow{2}{*}{RIMSE} +& \multicolumn{2}{|c|}{UCB} +& \multicolumn{2}{|c|}{PCI} \\ +\cline{6-9} +& & & & +& CR & AW +& CR & AW \\ +\hline +\multirow{2}{*}{$ \left(\frac{1}{2}, 0, \frac{1}{2}\right) $} +& \multirow{2}{*}{Total} +& \multirow{2}{*}{0.161} +& 2 & 0.00048 & 87.1\% & 0.0028 & 6.5\% & 0.0017 \\ +& & & 4 & 0.00068 & 95.2\% & 0.0042 & 9.7\% & 0.0025 \\ +\hline +\multirow{2}{*}{$ \left(\frac{1}{4}, 0, \frac{3}{4}\right) $} +& \multirow{2}{*}{Partial} +& \multirow{2}{*}{0.158} +& 2 & 0.00228 & 94.5\% & 0.0112 & 75.6\% & 0.0083 \\ +& & & 4 & 0.00234 & 94.7\% & 0.0124 & 65.3\% & 0.0087 \\ +\hline +\multirow{2}{*}{$ \left(\frac{1}{5}, \frac{1}{5}, \frac{3}{5}\right) $} +& \multirow{2}{*}{None} +& \multirow{2}{*}{0.145} +& 2 & 0.00201 & 94.2\% & 0.0106 & 73.4\% & 0.0077 \\ +& & & 4 & 0.00202 & 95.6\% & 0.0117 & 64.3\% & 0.0080 \\ +\hline +\end{tabular} +\caption[Numerical results for three values of the parameter $\pi$]{ +Numerical results for three values of the parameter $\pi$.} +\label{tab:kernel_results} +\end{table} + +Next, Table~\ref{tab:kernel_results} presents numerical results. For each +degeneracy +type (total, partial, none) and each kernel order ($p=2$, $p=4$), we run $2000$ +repeats with sample size $n=3000$ (giving $N=4\,498\,500$ pairs of nodes) and +with $d=50$ equally-spaced evaluation points. We record the average +rule-of-thumb bandwidth $\hat{h}_{\ROT}$ and the average root integrated mean +squared error (RIMSE). For both the uniform confidence bands (UCB) and the +pointwise confidence intervals (PCI), we report the coverage rate (CR) and the +average width (AW). +% +The lower-order kernel ($p=2$) ignores the bias, leading to good RIMSE +performance and acceptable UCB coverage under partial or no degeneracy, but +gives invalid inference under total degeneracy. In contrast, the higher-order +kernel ($p=4$) provides robust bias correction and hence improves the coverage +of the UCB in every regime, particularly under total degeneracy, at the cost of +increasing both the RIMSE and the average widths of the confidence bands. +% +As expected, the pointwise (in $w\in\cW$) confidence intervals (PCIs) severely +undercover in every regime. Thus our simulation results show that the proposed +feasible inference methods based on robust bias correction and proper +Studentization deliver valid uniform inference which is robust to unknown +degenerate points in the underlying dyadic distribution. 
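+
+As a point of reference for the simulation design, the data generating
+process of Section~\ref{sec:kernel_degeneracy} and the basic (uncorrected)
+point estimate can be sketched in a few lines of Python. The snippet below is
+an illustrative sketch with hypothetical helper names only; it is not the
+\texttt{DyadicKDE.jl} implementation referenced in
+Section~\ref{sec:kernel_conclusion}, and it omits the boundary correction,
+robust bias correction, and Studentization steps described above.
+%
+\begin{verbatim}
+# Illustrative sketch only: dyadic DGP W_ij = A_i A_j + V_ij and the
+# basic kernel density point estimate with an Epanechnikov kernel.
+import numpy as np
+
+def simulate_w(n, pi, rng):
+    # A_i equals -1, 0 or +1 with probabilities pi = (pi1, pi2, pi3)
+    a = rng.choice([-1.0, 0.0, 1.0], size=n, p=pi)
+    i, j = np.triu_indices(n, k=1)            # all pairs with i < j
+    return a[i] * a[j] + rng.standard_normal(i.size)
+
+def dyadic_kde(w_data, w_eval, h):
+    # hat f_W(w) = 2 / (n(n-1)) * sum over pairs of K((W_ij - w) / h) / h
+    u = (w_data[None, :] - w_eval[:, None]) / h
+    k = np.where(np.abs(u) <= 1, 0.75 * (1.0 - u ** 2), 0.0)
+    return k.mean(axis=1) / h
+
+rng = np.random.default_rng(0)
+w_data = simulate_w(100, (0.25, 0.0, 0.75), rng)  # partial degeneracy
+print(dyadic_kde(w_data, np.linspace(-2.0, 2.0, 5), h=0.3))
+\end{verbatim}
+%
+Replacing the probability vector with the other two parameter choices yields
+the totally degenerate and non-degenerate designs considered above.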
+ +\section{Counterfactual dyadic density estimation} +\label{sec:kernel_counterfactual} + +To further showcase the applicability of our main results, we develop a kernel +density estimator for dyadic counterfactual distributions. The aim of such +counterfactual analysis is to estimate the distribution of an outcome variable +had some covariates followed a distribution different from the actual one, and +it is important in causal inference and program evaluation settings +\citep{dinardo1996distribution,chernozhukov2013inference}. + +For each $r \in \{0,1\}$, let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be random +variables as defined in Assumption~\ref{ass:kernel_data} and +$\bX_n^r = (X_1^r, \ldots, X_n^r)$ be some covariates. +We assume that $(A_i^r, X_i^r)$ are independent over $1 \leq i \leq n$ +and that $\bX_n^r$ is independent of $\bV_n^r$, that +$W_{i j}^r \mid X_i^r, X_j^r$ has a conditional Lebesgue density +$f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$, +that $X_i^r$ follows a distribution function $F_X^r$ on a common support $\cX$, +and that $(\bA_n^0, \bV_n^0, \bX_n^0)$ +is independent of $(\bA_n^1, \bV_n^1, \bX_n^1)$. + +We interpret $r$ as an index for two populations, labeled $0$ and $1$. The +counterfactual density of population $1$ had it followed the +same covariate distribution as population $0$ is +% +\begin{align*} +f_W^{1 \triangleright 0}(w) +&= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right] \\ +&= \int_{\cX} \int_{\cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) +\psi(x_1) \psi(x_2) \diff F_X^{1}(x_1) \diff F_X^{1}(x_2), +\end{align*} +% +where $\psi(x) = \mathrm{d} F_X^0(x) / \mathrm{d} F_X^1(x)$ for $x \in \cX$ +is a Radon--Nikodym derivative. If $X^0_i$ and $X^1_i$ have Lebesgue densities, +it is natural to consider a parametric model of the form +$\mathrm{d} F_X^{r}(x)=f_X^r(x;\theta)\diff x$ +for some finite-dimensional parameter $\theta$. +Alternatively, if the covariates $X_n^r$ are discrete and have a positive +probability mass function $p_X^r(x)$ on a finite +support $\cX$, the object of interest becomes +$f_W^{1 \triangleright 0}(w) += \sum_{x_1 \in \cX} \sum_{x_2 \in \cX} +f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) +p_X^{1}(x_1) p_X^{1}(x_2)$, +where $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$. +We consider discrete covariates for simplicity, +and hence the counterfactual dyadic kernel density estimator is +% +\begin{align*} +\hat f_W^{\,1 \triangleright 0}(w) +&= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n +\hat \psi(X_i^1) \hat \psi(X_j^1) k_h(W_{i j}^1, w), +\end{align*} +% +where $\hat\psi(x) = \hat p_X^{\,0}(x) / \hat p_X^{\,1}(x)$ and +$\hat p_X^{\,r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$, +with $\I$ the indicator function. + +Section~\ref{sec:kernel_app_main} provides technical details: +we show how an asymptotic linear representation for $\hat\psi(x)$ leads to a +Hoeffding-type decomposition of $\hat f_W^{\,1 \triangleright 0}(w)$, +which is then used to establish that $\hat f_W^{\,1 \triangleright 0}$ +is uniformly consistent for $f_W^{\,1 \triangleright 0}(w)$ +and also admits a Gaussian strong approximation, with the same rates of +convergence as for the standard density estimator. Furthermore, define the +covariance function of $\hat f_W^{\,1 \triangleright 0}(w)$ as +$\Sigma_n^{1 \triangleright 0}(w,w') = \Cov\big[ +\hat f_W^{\,1 \triangleright 0}(w), +\hat f_W^{\,1 \triangleright 0}(w') \big]$, +which can be estimated as follows. 
First let +$\hat\kappa(X_i^0, X_i^1, x) += \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} +- \frac{\hat p_X^0(x)}{\hat p_X^1(x)} \frac{\I\{X_i^1 = x\} - \hat +p_X^1(x)}{\hat p_X^1(x)}$ +be a plug-in estimate of the influence function for $\hat\psi(x)$ +and define the leave-one-out conditional expectation estimators +$S_i^{1 \triangleright 0}(w) += \frac{1}{n-1} \big( \sum_{j=1}^{i-1} k_h(W_{j i}^1,w) \hat\psi(X_j^1) ++ \sum_{j=i+1}^n k_h(W_{i j}^1,w) \hat\psi(X_j^1) \big)$ +and $\tilde S_i^{1 \triangleright 0}(w) += \frac{1}{n-1} \sum_{j=1}^n \I\{j \neq i\} +\hat\kappa(X_i^0, X_i^1, X_j^1) S_j^{1 \triangleright 0}(w)$. +Define the covariance estimator +% +\begin{align*} +\hat\Sigma_n^{1 \triangleright 0}(w,w') +&= \frac{4}{n^2} \sum_{i=1}^n +\big( +\hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) ++ \tilde S_i^{1 \triangleright 0}(w) +\big) +\big( +\hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') ++ \tilde S_i^{1 \triangleright 0}(w') +\big) \\ +&\quad- +\frac{4}{n^3(n-1)} +\sum_{i 0$ and +a kernel function $k_h$ on $\R^m \times \R^m$, +the local polynomial regression estimator of $\mu(x_1, x_2)$ is +$\hat\mu(x_1, x_2) = e_1^\T \hat\beta(x_1, x_2)$ where +$e_1$ is the first standard unit vector in $\R^q$ for +$q=\binom{2m+\gamma}{\gamma}$ and +% +\begin{align} +\nonumber +\hat{\beta}(x_1, x_2) +&= +\argmin_{\beta \in \R^q} +\sum_{i=1}^{n-1} \sum_{j=i+1}^n +\left( Y_{i j} - r(X_i-x_1, X_j-x_2)^\T \beta \right)^2 +k_h(X_i-x_1, X_j-x_2) \\ +\label{eq:kernel_locpol} +&= +\left( +\sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} r_{i j}^\T +\right)^{-1} +\left( +\sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} Y_{i j} +\right), +\end{align} +% +with $k_{i j} = k_h(X_i-x_1, X_j-x_2)$ and $r_{i j} = r(X_i-x_1, X_j-x_2)$. +\citet{graham2021minimax} established pointwise distribution theory +for the special case of the dyadic Nadaraya--Watson kernel regression estimator +($\gamma=0$), but no uniform analogues have yet been given. It can be shown +that the ``denominator'' matrix in \eqref{eq:kernel_locpol} converges uniformly +to its +expectation, while the U-process-like ``numerator'' matrix can be handled the +same way as we analyzed $\hat f_W(w)$ in this chapter, through a Hoeffding-type +decomposition and strong approximation methods, along with standard bias +calculations. Such distributional approximation results can be used to +construct valid uniform confidence bands for the regression function +$\mu(x_1, x_2)$, as well as to conduct hypothesis testing for parametric +specifications or shape constraints. + +As a third example, we consider applying our results to semiparametric +semi-linear regression problems. The dyadic semi-linear regression model is +$\E[Y_{i j} \mid W_{i j}, X_i, X_j] = \theta^\T W_{i j} + g(X_i, X_j)$ +where $\theta$ is the finite-dimensional parameter of interest +and $g(X_i, X_j)$ is an unknown function of the covariates $(X_i, X_j)$. +Local polynomial (or other) methods can be used to estimate $\theta$ and $g$, +where the estimator of the nonparametric component $g$ takes a similar form to +\eqref{eq:kernel_locpol}, that is, a ratio of two kernel-based estimators as in +\eqref{eq:kernel_estimator}. Consequently, the strong approximation techniques +presented in this chapter can be appropriately modified to develop valid +uniform inference procedures for $g$ and +$\E[Y_{i j} \mid W_{i j}=w, X_i=x_1, X_j=x_2]$, as well as functionals thereof. 
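+
+As a concrete illustration of the notation in \eqref{eq:kernel_locpol}: with a
+scalar covariate ($m = 1$) and a local linear fit ($\gamma = 1$), the basis is
+$r(u, v) = (1, u, v)^\T$ with $q = \binom{3}{1} = 3$, so that
+$e_1^\T \hat\beta(x_1, x_2)$ reads off the locally fitted intercept, while a
+local quadratic fit ($\gamma = 2$) uses
+$r(u, v) = (1, u, v, u^2, u v, v^2)^\T$ with $q = \binom{4}{2} = 6$.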
+ +\section{Conclusion} +\label{sec:kernel_conclusion} + +We studied the uniform estimation and inference properties of the dyadic kernel +density estimator $\hat{f}_W$ given in \eqref{eq:kernel_estimator}, which forms +a class of U-process-like estimators indexed by the $n$-varying kernel function +$k_h$ on $\cW$. We established uniform minimax-optimal point estimation results +and uniform distributional approximations for this estimator based on novel +strong approximation strategies. We then applied these results to derive valid +and feasible uniform confidence bands for the dyadic density estimand $f_W$, +and also developed a substantive application of our theory to counterfactual +dyadic density analysis. We gave some other statistical applications of our +methodology as well as potential avenues for future research. From a technical +perspective, Appendix~\ref{app:kernel} contains several generic results +concerning strong approximation methods and maximal inequalities for empirical +processes that may be of independent interest. Implementations of this +chapter's methodology, along with replication files for the empirical results, +are provided by a Julia package available at +\github{wgunderwood/DyadicKDE.jl}. +This work is based on \citet{cattaneo2024uniform}, +and has been presented by Cattaneo at +the Columbia University Biostatistics Colloquium Seminar (2022) +and the Georgia Institute of Technology Statistics Seminar (2022), +by Feng at +the Renmin University Econometrics Seminar (2022), +the Xiamen University Symposium on Modern Statistics (2022), +the Peking University Econometrics Seminar (2023), +and the Asian Meeting of the Econometric Society +in East and Southeast Asia, Singapore (2023), +and by Underwood at the University of Illinois Statistics Seminar (2024), +the University of Michigan Statistics Seminar (2024), and the University of +Pittsburgh Statistics Seminar (2024). + +\chapter[Yurinskii's Coupling for Martingales]% +{Yurinskii's Coupling \\ for Martingales} +\label{ch:yurinskii} + +% abstract +Yurinskii's coupling is a popular theoretical tool for non-asymptotic +distributional analysis in mathematical statistics and applied probability, +offering a Gaussian strong approximation with an explicit error bound under +easily verified conditions. Originally stated in $\ell^2$-norm for sums of +independent random vectors, it has recently been extended both to the +$\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in +$\ell^2$-norm, under some strong conditions. We present as our main result a +Yurinskii coupling for approximate martingales in $\ell^p$-norm, under +substantially weaker conditions than those previously imposed. Our formulation +further allows for the coupling variable to follow a more general Gaussian +mixture distribution, and we provide a novel third-order coupling method which +gives tighter approximations in certain settings. We specialize our main result +to mixingales, martingales, and independent data, and derive uniform Gaussian +mixture strong approximations for martingale empirical processes. Substantive +applications of our theory to nonparametric partitioning-based and local +polynomial regression procedures are provided. + +\section{Introduction} + +Yurinskii's coupling \citep{yurinskii1978error} has proven to be an important +theoretical tool for developing non-asymptotic distributional approximations in +mathematical statistics and applied probability. 
For a sum $S$ of $n$ +independent zero-mean $d$-dimensional random vectors, this coupling technique +constructs (on a suitably enlarged probability space) a zero-mean +$d$-dimensional Gaussian vector $T$ with the same covariance matrix as $S$ and +which is close to $S$ in probability, bounding the discrepancy $\|S-T\|$ as a +function of $n$, $d$, the choice of the norm, and some features of the +underlying distribution. See, for example, \citet[Chapter 10]{pollard2002user} +for a textbook introduction. + +When compared to other coupling approaches, such as the celebrated Hungarian +construction \citep{komlos1975approximation} or Zaitsev's coupling +\citep{zaitsev1987estimates,zaitsev1987gaussian}, Yurinskii's approach stands +out for its simplicity, robustness, and wider applicability, while also +offering tighter couplings in some applications (see below for more discussion +and examples). These features have led many scholars to use Yurinskii's +coupling to study the distributional features of high-dimensional statistical +procedures in a variety of settings, often with the end goal of developing +uncertainty quantification or hypothesis testing methods. For example, in +recent years, Yurinskii's coupling has been used to construct Gaussian +approximations for the suprema of empirical processes +\citep{chernozhukov2014gaussian}; to establish distribution theory for +non-Donsker stochastic $t$-processes generated in nonparametric series +regression \citep{belloni2015some}; to prove distributional approximations for +high-dimensional $\ell^p$-norms \citep{biau2015high}; to develop distribution +theory for vector-valued martingales \citep{belloni2018high,li2020uniform}; to +derive a law of the iterated logarithm for stochastic gradient descent +optimization methods \citep{anastasiou2019normal}; to establish uniform +distributional results for nonparametric high-dimensional quantile processes +\citep{belloni2019conditional}; to develop distribution theory for non-Donsker +stochastic $t$-processes generated in partitioning-based series regression +\citep{cattaneo2020large}; to deduce Bernstein--von Mises theorems in +high-dimensional settings \citep{ray2021bernstein}; and to develop distribution +theory for non-Donsker U-processes based on dyadic network data +\citep{cattaneo2024uniform}. There are also many other early applications of +Yurinskii's coupling: \citet{dudley1983invariance} and \citet{dehling1983limit} +establish invariance principles for Banach space-valued random variables, and +\citet{lecam1988} and \citet{sheehy1992uniform} obtain uniform Donsker results +for empirical processes, to name just a few. + +This chapter presents a new Yurinskii coupling which encompasses and improves +upon all of the results previously available in the literature, offering four +new features: +% +\begin{enumerate}[label=(\roman*),leftmargin=*] +\item +\label{it:yurinskii_contribution_approximate_martingale} +It applies to vector-valued \textit{approximate martingale} data. +\item +\label{it:yurinskii_contribution_gaussian_mixture} +It allows for a \textit{Gaussian mixture} coupling distribution. +\item +\label{it:yurinskii_contribution_degeneracy} +It imposes \textit{no restrictions on degeneracy} of the +data covariance matrix. +\item +\label{it:yurinskii_contribution_third_order} +It establishes a \textit{third-order} coupling to +improve the approximation in certain situations. 
+\end{enumerate} +% + +Closest to our work are the unpublished manuscript by \citet{belloni2018high} +and the recent paper by \citet{li2020uniform}, which both investigated +distribution theory for martingale data using Yurinskii's coupling and related +methods. Specifically, \citet{li2020uniform} established a Gaussian +$\ell^2$-norm Yurinskii coupling for mixingales and martingales under the +assumption that the covariance structure has a minimum eigenvalue bounded away +from zero. As formally demonstrated in this chapter +(Section~\ref{sec:yurinskii_kde}), +such eigenvalue assumptions can be prohibitively strong in practically relevant +applications. In contrast, our Yurinskii coupling does not impose any +restrictions on covariance degeneracy +\ref{it:yurinskii_contribution_degeneracy}, in +addition to offering several other new features not present in +\citet{li2020uniform}, including +\ref{it:yurinskii_contribution_approximate_martingale}, +\ref{it:yurinskii_contribution_gaussian_mixture}, +\ref{it:yurinskii_contribution_third_order}, and +applicability to general $\ell^p$-norms. In addition, we correct a slight +technical inaccuracy in their proof relating to the derivation of bounds in +probability (Remark \ref{rem:yurinskii_coupling_bounds_probability}). +\citet{belloni2018high} did not establish a Yurinskii coupling for martingales, +but rather a central limit theorem for smooth functions of high-dimensional +martingales using the celebrated second-order Lindeberg method +\citep[see][and references therein]{chatterjee2006generalization}, explicitly +accounting for covariance degeneracy. As a consequence, their result could be +leveraged to deduce a Yurinskii coupling for martingales with additional, +non-trivial technical work (see Section~\ref{sec:yurinskii_app_proofs} +in Appendix~\ref{app:yurinskii} for details). +Nevertheless, a Yurinskii coupling derived from +\citet{belloni2018high} would not feature +\ref{it:yurinskii_contribution_approximate_martingale}, +\ref{it:yurinskii_contribution_gaussian_mixture}, +\ref{it:yurinskii_contribution_third_order}, or +general $\ell^p$-norms, as our results do. We discuss further the connections +between our work and the related literature in the upcoming sections, both when +introducing our main theoretical results and when presenting the examples and +statistical applications. + +The most general coupling result of this chapter +(Theorem~\ref{thm:yurinskii_sa_dependent}) is presented in +Section~\ref{sec:yurinskii_main_results}, where we also specialize it to a +slightly +weaker yet more user-friendly formulation +(Proposition~\ref{pro:yurinskii_sa_simplified}). Our Yurinskii coupling for +approximate +martingales is a strict generalization of all previous Yurinskii couplings +available in the literature, offering a Gaussian mixture strong approximation +for approximate martingale vectors in $\ell^p$-norm, with an improved rate of +approximation when the third moments of the data are negligible, and with no +assumptions on the spectrum of the data covariance matrix. A key technical +innovation underlying the proof of Theorem~\ref{thm:yurinskii_sa_dependent} is +that we +explicitly account for the possibility that the minimum eigenvalue of the +variance may be zero, or its lower bound may be unknown, with the argument +proceeding using a carefully tailored regularization. 
Establishing a coupling +to a Gaussian mixture distribution is achieved by an appropriate conditioning +argument, leveraging a conditional version of Strassen's theorem established by +\citet{chen2020jackknife}, along with some related technical work detailed in +Section~\ref{sec:yurinskii_app_proofs}. +A third-order coupling is obtained via +a modification of a standard smoothing technique for Borel sets from classical +versions of Yurinskii's coupling, enabling improved approximation errors +whenever third moments are negligible. + +In Proposition~\ref{pro:yurinskii_sa_simplified}, we explicitly tune the +parameters of +the aforementioned regularization to obtain a simpler, parameter-free version +of Yurinskii's coupling for approximate martingales, again offering Gaussian +mixture coupling distributions and an improved third-order approximation error. +This specialization of our main result takes an agnostic approach to potential +singularities in the data covariance matrix and, as such, may be improved in +specific applications where additional knowledge of the covariance structure is +available. Section~\ref{sec:yurinskii_main_results} also presents some further +refinements when additional structure is imposed, deriving Yurinskii couplings +for mixingales, martingales, and independent data as +Corollaries~\ref{cor:yurinskii_sa_mixingale}, +\ref{cor:yurinskii_sa_martingale}, and +\ref{cor:yurinskii_sa_indep}, respectively. We take the opportunity to discuss +and correct +in Remark~\ref{rem:yurinskii_coupling_bounds_probability} a technical issue +which is +often neglected \citep{pollard2002user, li2020uniform} when using Yurinskii's +coupling to derive bounds in probability. Section~\ref{sec:yurinskii_factor} +presents a +stylized example portraying the relevance of our main technical results in the +context of canonical factor models, illustrating the importance of each of our +new Yurinskii coupling features +\ref{it:yurinskii_contribution_approximate_martingale}--% +\ref{it:yurinskii_contribution_third_order}. + +Section~\ref{sec:yurinskii_emp_proc} considers a substantive application of our +main +results: strong approximation of martingale empirical processes. We begin with +the motivating example of canonical kernel density estimation, demonstrating +how Yurinskii's coupling can be applied, and showing in +Lemma~\ref{lem:yurinskii_kde_eigenvalue} why it is essential that we do not +place any +conditions on the minimum eigenvalue of the variance matrix +\ref{it:yurinskii_contribution_degeneracy}. +We then present a general-purpose strong +approximation for martingale empirical processes in +Proposition~\ref{pro:yurinskii_emp_proc}, combining classical results in the +empirical +process literature \citep{van1996weak} with our +Corollary~\ref{cor:yurinskii_sa_martingale}. This statement appears to be the +first of +its kind for martingale data, and when specialized to independent +(and not necessarily identically distributed) data, it is +shown to be superior to the best known comparable strong approximation result +available in the literature \citep{berthet2006revisiting}. Our improvement +comes from using Yurinskii's coupling for the $\ell^\infty$-norm, where +\citet{berthet2006revisiting} apply Zaitsev's coupling +\citep{zaitsev1987estimates, zaitsev1987gaussian} with the larger +$\ell^2$-norm. + +Section~\ref{sec:yurinskii_nonparametric} further illustrates the applicability +of our +results through two examples in nonparametric regression estimation. 
Firstly,
+we deduce a strong approximation for partitioning-based least squares series
+estimators with time series data, applying
+Corollary~\ref{cor:yurinskii_sa_martingale}
+directly and additionally imposing only a mild mixing condition on the
+regressors. We show that our Yurinskii coupling for martingale vectors delivers
+the same distributional approximation rate as the best known result for
+independent data, and discuss how this can be leveraged to yield a feasible
+statistical inference procedure. We also show that if the residuals have
+vanishing conditional third moment, an improved rate of Gaussian approximation
+can be established. Secondly, we deduce a strong approximation for local
+polynomial estimators with time series data,
+using our result on martingale empirical processes
+(Proposition~\ref{pro:yurinskii_emp_proc}) and again imposing a mixing
+assumption.
+Appealing to empirical process theory is essential here as, in contrast with
+series estimators, local polynomials do not possess certain additive
+separability properties. The bandwidth restrictions we require are relatively
+mild, and, as far as we know, they have not been improved upon even with
+independent data.
+
+Section~\ref{sec:yurinskii_conclusion} concludes the chapter.
+All proofs are collected in
+Appendix~\ref{app:yurinskii}, which also includes other technical lemmas
+of potential independent interest, alongside some further results on
+applications of our theory to deriving high-dimensional central limit theorems
+for martingales in Section~\ref{sec:yurinskii_app_high_dim_clt}.
+
+\subsection{Notation}
+
+We write $\|x\|_p$ for $p\in[1,\infty]$ to denote the $\ell^p$-norm if $x$ is a
+(possibly random) vector or the induced operator $\ell^p$--$\ell^p$-norm if $x$
+is a matrix. For $X$ a real-valued random variable and an Orlicz function
+$\psi$, we use $\vvvert X \vvvert_\psi$ to denote the Orlicz $\psi$-norm
+\citep[Section~2.2]{van1996weak} and $\vvvert X \vvvert_p$
+for the $L^p(\P)$-norm where
+$p\in [1,\infty]$. For a matrix $M$, we write $\|M\|_{\max}$ for the
+maximum absolute entry and $\|M\|_\rF$ for the Frobenius norm. We denote
+positive semi-definiteness by $M \succeq 0$ and write $I_d$ for the $d \times
+d$ identity matrix.
+
+For scalar sequences $x_n$ and $y_n$, we write $x_n \lesssim y_n$ if there
+exists a positive constant $C$ such that $|x_n| \leq C |y_n|$ for sufficiently
+large $n$. We write $x_n \asymp y_n$ to indicate both $x_n \lesssim y_n$ and
+$y_n \lesssim x_n$. Similarly, for random variables $X_n$ and $Y_n$, we write
+$X_n \lesssim_\P Y_n$ if for every $\varepsilon > 0$ there exists a positive
+constant $C$ such that $\P(|X_n| > C |Y_n|) \leq \varepsilon$, and write
+$X_n \to_\P X$ for limits in probability. For real numbers $a$ and $b$ we use
+$a \vee b = \max\{a,b\}$. We write $\kappa \in \N^d$ for a multi-index, where
+$d \in \N = \{0, 1, 2, \ldots\}$, and define $|\kappa| = \sum_{j=1}^d \kappa_j$
+and $x^\kappa = \prod_{j=1}^d x_j^{\kappa_j}$ for $x \in \R^d$,
+and $\kappa! = \prod_{j=1}^{d} \kappa_j !$.
+
+Since our results concern couplings, some statements must be made on a new or
+enlarged probability space. We omit the details of this for clarity of
+notation, but technicalities are handled by the Vorob'ev--Berkes--Philipp
+Theorem~\citep[Theorem~1.1.10]{dudley1999uniform}.
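+
+As a brief illustration of the multi-index notation, taking $d = 2$ and
+$\kappa = (2, 1)$ gives $|\kappa| = 3$, $x^\kappa = x_1^2 x_2$, and
+$\kappa! = 2$; in general there are $\binom{d + 2}{3}$ multi-indices
+$\kappa \in \N^d$ with $|\kappa| = 3$.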
+ +\section{Main results} +\label{sec:yurinskii_main_results} + +We begin with our most general result: an $\ell^p$-norm Yurinskii coupling of a +sum of vector-valued approximate martingale differences to a Gaussian +mixture-distributed random vector. The general result is presented in +Theorem~\ref{thm:yurinskii_sa_dependent}, while +Proposition~\ref{pro:yurinskii_sa_simplified} gives +a simplified and slightly weaker version which is easier to use in +applications. We then further specialize +Proposition~\ref{pro:yurinskii_sa_simplified} to +three scenarios with successively stronger assumptions, namely mixingales, +martingales, and independent data in +Corollaries~\ref{cor:yurinskii_sa_mixingale}, +\ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} +respectively. In each case we +allow for possibly random quadratic variations (cf.\ mixing convergence), +thereby establishing a Gaussian mixture coupling in the general setting. In +Remark~\ref{rem:yurinskii_coupling_bounds_probability} we comment on and +correct an often +overlooked technicality relating to the derivation of bounds in probability +from Yurinskii's coupling. As a first illustration of the power of our +generalized $\ell^p$-norm Yurinskii coupling, we present in +Section~\ref{sec:yurinskii_factor} a simple factor model example relating to +all three of the aforementioned scenarios. + +\begin{theorem}[Strong approximation for vector-valued approximate martingales] +\label{thm:yurinskii_sa_dependent} + +Take a complete probability space with a countably generated filtration +$\cH_0, \ldots, \cH_n$ for $n \geq 1$, supporting the $\R^d$-valued +square-integrable variables $X_1, \ldots, X_n$. +Let $S = \sum_{i=1}^n X_i$ and define +% +\begin{align*} +\tilde X_i +&= \sum_{r=1}^n \big(\E[X_{r} \mid \cH_{i}] - \E[X_{r} \mid \cH_{i-1}]\big) +& &\text{and} +&U &= \sum_{i=1}^{n} \big( X_i - \E[ X_i \mid \cH_n] ++ \E[ X_i \mid \cH_0 ] \big). +\end{align*} +% +Let $V_i = \Var[\tilde X_i \mid \cH_{i-1}]$ and +define $\Omega = \sum_{i=1}^n V_i - \Sigma$ +where $\Sigma$ is an almost surely positive semi-definite $\cH_0$-measurable +$d \times d$ matrix. 
Then, for each $\eta > 0$ and $p \in [1,\infty]$, +there exists, on an enlarged probability space, an $\R^d$-valued random +vector $T$ with $T \mid \cH_0 \sim \cN(0, \Sigma)$ and +% +\begin{align} +\label{eq:yurinskii_sa_dependent} +\P\big(\|S-T\|_p > 6\eta\big) +&\leq +\inf_{t>0} +\left\{ +2 \P\big( \|Z\|_p > t \big) ++ \min\left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\} \nonumber \\ +&\quad+ +\inf_{M \succeq 0} +\Big\{ 2 \P\big(\Omega \npreceq M\big) + \delta_p(M,\eta) ++ \varepsilon_p(M, \eta)\Big\} ++\P\big(\|U\|_p>\eta\big), +\end{align} +% +where $Z, Z_1,\dots ,Z_n$ are i.i.d.\ standard Gaussian random variables on +$\R^d$ independent of $\cH_n$, the second infimum is taken over all positive +semi-definite $d \times d$ non-random matrices $M$, +% +\begin{align*} +\beta_{p,k} +&= +\sum_{i=1}^n \E\left[\| \tilde X_i \|^k_2 \| \tilde X_i \|_p ++ \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], +&\pi_3 +&= +\sum_{i=1}^{n} +\sum_{|\kappa| = 3} +\E \Big[ \big| +\E [ \tilde X_i^\kappa \mid \cH_{i-1} ] +\big| \Big] +\end{align*} +% +for $k \in \{2, 3\}$, with $\pi_3 = \infty$ if the associated +conditional expectation does not exist, and with +% +\begin{align*} +\delta_p(M,\eta) +&= +\P\left( +\big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p +\geq \eta +\right), \\ +\varepsilon_p(M, \eta) +&= +\P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ +\Omega \preceq M\right). +\end{align*} +\end{theorem} + +This theorem offers four novel contributions to the literature on coupling +theory and strong approximation, as discussed in the introduction. +% approximate martingales +Firstly \ref{it:yurinskii_contribution_approximate_martingale}, it allows for +approximate +vector-valued martingales, with the variables $\tilde X_i$ forming martingale +differences with respect to $\cH_i$ by construction, and $U$ quantifying the +associated martingale approximation error. Such martingale approximation +techniques for sequences of dependent random vectors are well established and +have been used in a range of scenarios: see, for example, +\citet{wu2004martingale}, \citet{dedecker2007weak}, \citet{zhao2008martingale}, +\citet{peligrad2010conditional}, \citet{atchade2014martingale}, +\citet{cuny2014martingale}, \citet{magda2018martingale}, and references +therein. In Section~\ref{sec:yurinskii_mixingales} we demonstrate how this +approximation +can be established in practice by restricting our general theorem to the +special case of mixingales, while the upcoming example in +Section~\ref{sec:yurinskii_factor} provides an illustration in the context of +auto-regressive factor models. + +% Gaussian mixture +Secondly \ref{it:yurinskii_contribution_gaussian_mixture}, +Theorem~\ref{thm:yurinskii_sa_dependent} allows for the +resulting coupling variable $T$ +to follow a multivariate Gaussian distribution only conditionally, +and thus we offer a useful analog of mixing convergence in the context +of strong approximation. +To be more precise, the random matrix $\sum_{i=1}^{n} V_i$ +is the quadratic variation of the constructed martingale +$\sum_{i=1}^n \tilde X_i$, and we approximate it using the $\cH_0$-measurable +random matrix $\Sigma$. This yields the coupling variable +$T \mid \cH_0 \sim \cN(0, \Sigma)$, which can alternatively be written as +$T=\Sigma^{1/2} Z$ with $Z \sim \cN(0,I_d)$ independent of $\cH_0$. 
+The errors in this quadratic variation +approximation are accounted for by the terms +$\P(\Omega \npreceq M)$, $\delta_p(M, \eta)$, and $\varepsilon_p(M, \eta)$, +utilizing a regularization argument through the free matrix parameter $M$. +If a non-random $\Sigma$ is used, then $T$ is unconditionally Gaussian, +and one can take $\cH_0$ to be the trivial $\sigma$-algebra. +As demonstrated in our proof, our approach to establishing a +mixing approximation is different from naively taking an unconditional version +of Yurinskii's coupling and applying +it conditionally on $\cH_0$, which will not deliver the same coupling as in +Theorem~\ref{thm:yurinskii_sa_dependent} for a few reasons. +To begin with, we explicitly indicate in the +conditions of Theorem~\ref{thm:yurinskii_sa_dependent} where conditioning is +required. +Next, our error of approximation is given unconditionally, +involving only marginal expectations and probabilities. +Finally, we provide a rigorous account of the construction of the +conditionally Gaussian coupling variable $T$ via a conditional version +of Strassen's theorem \citep{chen2020jackknife}. +Section~\ref{sec:yurinskii_martingales} +illustrates how a strong approximation akin to +mixing convergence can arise when the data +forms an exact martingale, and Section~\ref{sec:yurinskii_factor} gives a +simple example +relating to factor modeling in statistics and data science. + +% remove lower bound on minimum eigenvalue +As a third contribution to the literature +\ref{it:yurinskii_contribution_degeneracy}, and +of particular importance for applications, +Theorem~\ref{thm:yurinskii_sa_dependent} makes +no requirements on the minimum eigenvalue of the quadratic variation of the +approximating martingale sequence. Instead, our proof technique employs a +careful regularization scheme designed to account for any such exact or +approximate rank degeneracy in $\Sigma$. This capability is fundamental in some +applications, a fact which we illustrate in Section \ref{sec:yurinskii_kde} by +demonstrating the significant improvements in strong approximation errors +delivered by Theorem~\ref{thm:yurinskii_sa_dependent} relative to those +obtained using +prior results in the literature. + +% matching third moments +Finally \ref{it:yurinskii_contribution_third_order}, +Theorem~\ref{thm:yurinskii_sa_dependent} gives +a third-order strong approximation alongside the usual second-order +version considered in all prior literature. +More precisely, we observe that an analog of the term +$\beta_{p,2}$ is present in the +classical Yurinskii coupling and comes from a Lindeberg +telescoping sum argument, +replacing random variables by Gaussians with the same mean +and variance to match the first and second moments. +Whenever the third moments of $\tilde X_i$ are negligible +(quantified by $\pi_3$), this moment-matching argument can be extended to +third-order terms, giving a new term $\beta_{p,3}$. +In certain settings, such as when the data is symmetrically +distributed around zero, using $\beta_{p,3}$ rather than $\beta_{p,2}$ +can give smaller approximation errors in the coupling given in +\eqref{eq:yurinskii_sa_dependent}. +Such a refinement can be viewed as a strong approximation counterpart +to classical Edgeworth expansion methods. +We illustrate this phenomenon in our +upcoming applications to nonparametric inference +(Section~\ref{sec:yurinskii_nonparametric}). 
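+
+As a simple illustration of this last point, suppose that each $\tilde X_i$ is,
+conditionally on $\cH_{i-1}$, symmetrically distributed about zero with finite
+third moments. Then for any multi-index $\kappa$ with $|\kappa| = 3$,
+%
+\begin{align*}
+\E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big]
+= \E\big[ (-\tilde X_i)^\kappa \mid \cH_{i-1} \big]
+= (-1)^{|\kappa|} \E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big]
+= - \E\big[ \tilde X_i^\kappa \mid \cH_{i-1} \big],
+\end{align*}
+%
+so every such conditional third moment is zero and hence $\pi_3 = 0$.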
+ +\subsection{User-friendly formulation of the main result}% + +The result in Theorem~\ref{thm:yurinskii_sa_dependent} is given in a somewhat +implicit +manner, involving infima over the free parameters $t > 0$ and $M \succeq 0$, +and it is not clear how to compute these in general. In the upcoming +Proposition~\ref{pro:yurinskii_sa_simplified}, we set $M = \nu^2 I_d$ and +approximately +optimize over $t > 0$ and $\nu > 0$, resulting in a simplified and slightly +weaker version of our main general result. In specific applications, where +there is additional knowledge of the quadratic variation structure, other +choices of regularization schemes may be more appropriate. Nonetheless, the +choice $M = \nu^2 I_d$ leads to arguably the principal result of our work, +due to its simplicity and utility in statistical applications. For convenience, +define the functions $\phi_p : \N \to \R$ for $p \in [0, \infty]$, +% +\begin{align*} +\phi_p(d) = +\begin{cases} +\sqrt{pd^{2/p} } & \text{ if } p \in [1,\infty), \\ +\sqrt{2\log 2d} & \text{ if } p =\infty, +\end{cases} +\end{align*} +% +which are related to tail probabilities +of the $\ell^p$-norm of a standard Gaussian. + +\begin{proposition}[Simplified strong approximation +for approximate martingales]% +\label{pro:yurinskii_sa_simplified} + +Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}. +For each $\eta > 0$ and $p \in [1,\infty]$, +there exists a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++\P\left(\|U\|_p>\frac{\eta}{6}\right). +\end{align*} +% +If further $\pi_3 = 0$ then +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} +\right)^{1/4} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++\P\left(\|U\|_p>\frac{\eta}{6}\right). +\end{align*} +% +\end{proposition} + +Proposition~\ref{pro:yurinskii_sa_simplified} makes clear the potential benefit +of a +third-order coupling when $\pi_3 = 0$, as in this case the bound features +$\beta_{p,3}^{1/4}$ rather than $\beta_{p,2}^{1/3}$. If $\pi_3$ is small but +non-zero, an analogous result can easily be derived by adjusting the optimal +choices of $t$ and $\nu$, but we omit this for clarity of notation. In +applications (see Section~\ref{sec:yurinskii_series}), this reduction of the +exponent can +provide a significant improvement in terms of the dependence of the bound on +the sample size $n$, the dimension $d$, and other problem-specific quantities. +When using our results for strong approximation, it is usual to set +$p = \infty$ to bound the maximum discrepancy over the entries of a vector (to +construct uniform confidence sets, for example). In this setting, we have that +$\phi_\infty(d) = \sqrt{2 \log 2d}$ has a sub-Gaussian slow-growing dependence +on the dimension. The remaining term depends on $\E[\|\Omega\|_2]$ and requires +that the matrix $\Sigma$ be a good approximation of $\sum_{i=1}^{n} V_i$, while +remaining $\cH_0$-measurable. In some applications (such as factor modeling; +see Section~\ref{sec:yurinskii_factor}), it can be shown that the quadratic +variation +$\sum_{i=1}^n V_i$ remains random and $\cH_0$-measurable even in large samples, +giving a natural choice for $\Sigma$. 
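+
+For comparison, note that $\phi_2(d) = \sqrt{2 d}$ grows polynomially in the
+dimension, so a coupling computed in the $\ell^2$-norm (as in the classical
+Yurinskii approach) pays a $\sqrt{d}$ factor where the $\ell^\infty$-norm
+version above pays only $\sqrt{2 \log 2 d}$.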
+ +In the next few sections, we continue to refine +Proposition~\ref{pro:yurinskii_sa_simplified}, presenting a sequence of results +with +increasingly strict assumptions on the dependence structure of the data $X_i$. +These allow us to demonstrate the broad applicability of our main results, +providing more explicit bounds in settings which are likely to be of special +interest. In particular, we consider mixingales, martingales, and independent +data, comparing our derived results with those in the existing literature. + +\subsection{Mixingales} +\label{sec:yurinskii_mixingales} + +In our first refinement, we provide a natural method for bounding the +martingale approximation error term $U$. Suppose that $X_i$ form an +$\ell^p$-mixingale in $L^1(\P)$ in the sense that there exist non-negative +$c_1, \ldots, c_n$ and $\zeta_0, \ldots, \zeta_n$ such that for all +$1 \leq i \leq n$ and $0 \leq r \leq i$, +% +\begin{align} +\label{eq:yurinskii_mixingale_1} +\E \left[ \left\| +\E \left[ X_i \mid \cH_{i-r} \right] +\right\|_p \right] +&\leq +c_i \zeta_r, +\end{align} +% +and for all $1 \leq i \leq n$ and $0 \leq r \leq n-i$, +% +\begin{align} +\label{eq:yurinskii_mixingale_2} +\E \left[ \big\| +X_i - \E \big[ X_i \mid \cH_{i+r} \big] +\big\|_p \right] +&\leq +c_i \zeta_{r+1}. +\end{align} +% +These conditions are satisfied, for example, if $X_i$ are integrable strongly +$\alpha$-mixing random variables \citep{mcleish1975invariance}, or if $X_i$ are +generated by an auto-regressive or auto-regressive moving average process (see +Section~\ref{sec:yurinskii_factor}), among many other possibilities +\citep{bradley2005basic}. Then, in the notation of +Theorem~\ref{thm:yurinskii_sa_dependent}, we have by Markov's inequality that +% +\begin{align*} +\P \left( \|U\|_p > \frac{\eta}{6} \right) +&\leq +\frac{6}{\eta} +\sum_{i=1}^{n} +\E \left[ +\big\| +X_i - \E \left[ X_i \mid \cH_n \right] +\big\|_p ++ \big\| +\E \left[ X_i \mid \cH_0 \right] +\big\|_p +\right] +\leq \frac{\zeta}{\eta}, +\end{align*} +% +with $\zeta = 6 \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. +Combining Proposition~\ref{pro:yurinskii_sa_simplified} with this +martingale error bound yields the following result for mixingales. +% +\begin{corollary}[Strong approximation for vector-valued mixingales]% +\label{cor:yurinskii_sa_mixingale} + +Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, +and suppose +the mixingale conditions \eqref{eq:yurinskii_mixingale_1} and +\eqref{eq:yurinskii_mixingale_2} hold. For each $\eta > 0$ and +$p \in [1,\infty]$ there +is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++ \frac{\zeta}{\eta}. +\end{align*} +% +If further $\pi_3 = 0$ then +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} +\right)^{1/4} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++ \frac{\zeta}{\eta}. +\end{align*} +% +\end{corollary} + +The closest antecedent to Corollary~\ref{cor:yurinskii_sa_mixingale} is found in +\citet[Theorem~4]{li2020uniform}, who also considered Yurinskii's coupling for +mixingales. 
Our result improves on this work in the following manner: it +removes any requirements on the minimum eigenvalue of the quadratic variation +of the mixingale sequence; it allows for general $\ell^p$-norms with +$p\in[1,\infty]$; it establishes a coupling to a multivariate Gaussian +mixture distribution in general; and it permits third-order couplings +(when $\pi_3=0$). These improvements have important practical implications as +demonstrated in Sections \ref{sec:yurinskii_factor} and +\ref{sec:yurinskii_nonparametric}, +where significantly better coupling approximation +errors are demonstrated for a variety of statistical applications. On the +technical side, our result is rigorously established using a conditional +version of Strassen's theorem \citep{chen2020jackknife}, a carefully crafted +regularization argument, and a third-order Lindeberg method +\citep[see][and references therein, for more discussion on the +standard second-order Lindeberg method]{chatterjee2006generalization}. +Furthermore, as explained in +Remark~\ref{rem:yurinskii_coupling_bounds_probability}, we +clarify a technical issue in \citet{li2020uniform} surrounding the derivation +of valid probability bounds for $\|S-T\|_p$. + +Corollary~\ref{cor:yurinskii_sa_mixingale} focused on mixingales for +simplicity, but, as +previously discussed, any method for constructing a martingale approximation +$\tilde X_i$ and bounding the resulting error $U$ could be used instead in +Proposition~\ref{pro:yurinskii_sa_simplified} to derive a similar result. + +\subsection{Martingales} +\label{sec:yurinskii_martingales} + +For our second refinement, suppose that +$X_i$ form martingale differences with respect to $\cH_i$. +In this case, $\E[X_i \mid \cH_n] = X_i$ and $\E[X_i \mid \cH_0] = 0$, +so $U = 0$, and the martingale approximation error term vanishes. +Applying Proposition~\ref{pro:yurinskii_sa_simplified} in this setting +directly yields the following result. +% +\begin{corollary}[Strong approximation for vector-valued martingales]% +\label{cor:yurinskii_sa_martingale} + +With the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, +suppose that +$X_i$ is $\cH_i$-measurable satisfying $\E[X_i \mid \cH_{i-1}] = 0$ for +$1 \leq i \leq n$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, there is +a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with +% +\begin{align} +\label{eq:yurinskii_sa_martingale_order_2} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3}. +\end{align} +% +If further $\pi_3 = 0$ then +% +\begin{align} +\label{eq:yurinskii_sa_martingale_order_3} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} +\right)^{1/4} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3}. +\end{align} +% +\end{corollary} + +The closest antecedents to Corollary~\ref{cor:yurinskii_sa_martingale} are +\citet{belloni2018high} and \citet{li2020uniform}, who also implicitly or +explicitly considered Yurinskii's coupling for martingales. 
More specifically, +\citet[Theorem~1]{li2020uniform} established an explicit +$\ell^2$-norm Yurinskii coupling +for martingales under a strong assumption on the minimum eigenvalue of the +martingale quadratic variation, while \citet[Theorem~2.1]{belloni2018high} +established a central limit theorem for vector-valued martingale sequences +employing the standard second-order Lindeberg method, implying that their proof +could be adapted to deduce a Yurinskii coupling for martingales with the help +of a conditional version of Strassen's theorem \citep{chen2020jackknife} and +some additional nontrivial technical work. + +Corollary~\ref{cor:yurinskii_sa_martingale} improves over this prior work as +follows. +With respect to \citet{li2020uniform}, our result establishes an $\ell^p$-norm +Gaussian mixture Yurinskii coupling for martingales without any requirements on +the minimum eigenvalue of the martingale quadratic variation, and permits a +third-order coupling if $\pi_3=0$. The first probability bound +\eqref{eq:yurinskii_sa_martingale_order_2} in +Corollary~\ref{cor:yurinskii_sa_martingale} gives the +same rate of strong approximation as that in Theorem~1 of \citet{li2020uniform} +when $p=2$, with non-random $\Sigma$, and when the eigenvalues of a normalized +version of $\Sigma$ are bounded away from zero. In +Section~\ref{sec:yurinskii_kde} we +demonstrate the crucial importance of removing this eigenvalue lower bound +restriction in applications involving nonparametric kernel estimators, while in +Section~\ref{sec:yurinskii_series} we demonstrate how the availability of a +third-order +coupling \eqref{eq:yurinskii_sa_martingale_order_3} can give improved +approximation rates +in applications involving nonparametric series estimators with conditionally +symmetrically distributed residual errors. Finally, our technical work improves +on \citet{li2020uniform} in two respects: +% +\begin{inlineroman} +\item +we employ a conditional version +of Strassen's theorem (see Lemma~\ref{lem:yurinskii_app_strassen} +in the appendix) +to appropriately handle the conditioning arguments; and +\item +we deduce valid +probability bounds for $\|S-T\|_p$, as the following +Remark~\ref{rem:yurinskii_coupling_bounds_probability} makes clear. +\end{inlineroman} + +\begin{remark}[Yurinskii's coupling and bounds in probability] +\label{rem:yurinskii_coupling_bounds_probability} +Given a sequence of random vectors $S_n$, Yurinskii's method provides a +coupling in the following form: for each $n$ and any $\eta > 0$, there exists +a random vector $T_n$ with $\P\big(\|S_n - T_n\| > \eta\big) < r_n(\eta)$, +where $r_n(\eta)$ is the approximation error. Crucially, each coupling +variable $T_n$ is a function of the desired approximation level $\eta$ and, +as such, deducing bounds in probability on $\|S_n - T_n\|$ requires some +extra care. One option is to select a sequence $R_n \to \infty$ and note that +$\P\big(\|S_n - T_n\| > r_n^{-1}(1 / R_n)\big) < 1 / R_n \to 0$ and hence +$\|S_n - T_n\| \lesssim_\P r_n^{-1}(1 / R_n)$. In this case, $T_n$ depends on +the choice of $R_n$, which can in turn typically be chosen to diverge slowly +enough to cause no issues in applications. +\end{remark} + +Technicalities akin to those outlined in +Remark~\ref{rem:yurinskii_coupling_bounds_probability} have been both addressed +and +neglected alike in the prior literature. \citet[Chapter 10.4, Example +16]{pollard2002user} apparently misses this subtlety, providing an +inaccurate bound in probability based on the Yurinskii coupling. 
+\citet{li2020uniform} seem to make the same mistake in the proof of their +Lemma~A2, which invalidates the conclusion of their Theorem~1. In contrast, +\citet{belloni2015some} and \citet{belloni2019conditional} directly provide +bounds in $o_\P$ instead of $O_\P$, circumventing these issues in a manner +similar to our approach involving a diverging sequence $R_n$. + +To see how this phenomenon applies to our main results, observe that the +second-order martingale coupling given as +\eqref{eq:yurinskii_sa_martingale_order_2} in +Corollary~\ref{cor:yurinskii_sa_martingale} implies that for any +$R_n \to \infty$, +% +\begin{align*} +\|S - T\|_p +\lesssim_\P +\beta_{p,2}^{1/3} +\phi_p(d)^{2/3} R_n ++ \E[\|\Omega\|_2]^{1/2} +\phi_p(d) R_n. +\end{align*} +% +This bound is comparable to that obtained by \citet[Theorem~1]{li2020uniform} +with $p=2$, albeit with their formulation missing the $R_n$ correction terms. +In Section~\ref{sec:yurinskii_series} we discuss further their (amended) +result, in the +setting of nonparametric series estimation. Our approach using +$p = \infty$ obtains superior distributional approximation rates, alongside +exhibiting various other improvements such as the aforementioned third-order +coupling. + +Turning to the comparison with \citet{belloni2018high}, our +Corollary~\ref{cor:yurinskii_sa_martingale} again offers the same improvements, +with the +only exception being that the authors did account for the implications of a +possibly vanishing minimum eigenvalue. However, their results exclusively +concern high-dimensional central limit theorems for vector-valued martingales, +and therefore while their findings +could in principle enable the derivation of a result similar to our +Corollary~\ref{cor:yurinskii_sa_martingale}, this would require additional +technical work +on their behalf in multiple ways +(see Appendix~\ref{app:yurinskii}): +% +\begin{inlineroman} +\item a correct application of a conditional +version of Strassen's theorem +(Lemma~\ref{lem:yurinskii_app_strassen}); +\item the development of a third-order Borel set smoothing technique and +associated $\ell^p$-norm moment control +(Lemmas \ref{lem:yurinskii_app_smooth_approximation}, +\ref{lem:yurinskii_app_gaussian_useful}, +and \ref{lem:yurinskii_app_gaussian_pnorm}); +\item a careful truncation scheme to account for +$\Omega\npreceq0$; and +\item a valid third-order Lindeberg argument +(Lemma \ref{lem:yurinskii_app_sa_martingale}), +among others. +\end{inlineroman} + +\subsection{Independence} + +As a final refinement, suppose that $X_i$ are independent and +zero-mean conditionally on $\cH_0$, +and take $\cH_i$ to be the filtration +generated by $X_1, \ldots, X_i$ and $\cH_0$ for $1 \leq i \leq n$. +Then, taking $\Sigma = \sum_{i=1}^n V_i$ +gives $\Omega = 0$, and hence Corollary~\ref{cor:yurinskii_sa_martingale} +immediately yields the following result. +% +\begin{corollary}[Strong approximation for sums of independent vectors]% +\label{cor:yurinskii_sa_indep} + +Take the setup of Theorem~\ref{thm:yurinskii_sa_dependent}, +and let $X_i$ be independent given $\cH_0$, +with $\E[X_i \mid \cH_0] = 0$. +Then, for each $\eta > 0$ and $p \in [1,\infty]$, +with $\Sigma = \sum_{i=1}^n V_i$, +there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with +% +\begin{align} +\label{eq:yurinskii_sa_indep_order_2} +\P\big(\|S-T\|_p > \eta\big) +&\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}. 
+\end{align} +% +If further $\pi_3 = 0$ then +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}. +\end{align*} +% +\end{corollary} + +Taking $\cH_0$ to be trivial, +\eqref{eq:yurinskii_sa_indep_order_2} provides an $\ell^p$-norm approximation +analogous to that presented in \citet{belloni2019conditional}. +By further +restricting to $p=2$, we recover the original Yurinskii coupling as presented +in \citet[Theorem~1]{lecam1988} and \citet[Theorem~10]{pollard2002user}. Thus, +in the independent data setting, our result improves on prior work as follows: +\begin{inlineroman} +\item +it establishes a coupling to a multivariate Gaussian mixture distribution; +and +\item +it permits a third-order coupling if $\pi_3=0$. +\end{inlineroman} + +\subsection{Stylized example: factor modeling} +\label{sec:yurinskii_factor} + +In this section, we present a simple statistical example of how our +improvements over prior coupling results can have important theoretical and +practical implications. Consider the stylized factor model +% +\begin{align*} +X_i = L f_i + \varepsilon_i, \qquad 1 \leq i \leq n, +\end{align*} +% +with random variables $L$ taking values in $\R^{d \times m}$, $f_i$ in $\R^m$, +and $\varepsilon_i$ in $\R^d$. We interpret $f_i$ as a latent factor variable +and $L$ as a random factor loading, with idiosyncratic disturbances +$\varepsilon_i$. See \citet{fan2020statistical}, and references therein, for a +textbook review of factor analysis in statistics and econometrics. + +We employ the above factor model to give a first illustration of the +applicability of our main result Theorem~\ref{thm:yurinskii_sa_dependent}, the +user-friendly Proposition~\ref{pro:yurinskii_sa_simplified}, and their +specialized +Corollaries~\ref{cor:yurinskii_sa_mixingale}--\ref{cor:yurinskii_sa_indep}. We +consider three different sets of conditions to demonstrate the applicability of +each of our corollaries for mixingales, martingales, and independent data, +respectively. We assume throughout that +$(\varepsilon_1, \ldots, \varepsilon_n)$ is zero-mean and finite variance, and +that $(\varepsilon_1, \ldots, \varepsilon_n)$ is independent +of $L$ and $(f_1, \ldots, f_n)$. Let $\cH_i$ be the $\sigma$-algebra generated +by $L$, $(f_1, \ldots, f_i)$, and $(\varepsilon_1, \ldots, \varepsilon_i)$, with +$\cH_0$ the $\sigma$-algebra generated by $L$ alone. + +\begin{itemize} +\item \emph{Independent data}. +Suppose that the factors $(f_1, \ldots, +f_n)$ are independent conditional on $L$ and satisfy +$\E [ f_i \mid L ] = 0$. +Then, since $X_i$ are independent conditional on $\cH_0$ and with +$\E [ X_i \mid \cH_0 ] = \E [ L f_i + \varepsilon_i \mid L ] = 0$, +we can apply Corollary~\ref{cor:yurinskii_sa_indep} to $\sum_{i=1}^n X_i$. +In general, we will obtain a coupling variable which has the Gaussian +mixture distribution $T \mid \cH_0 \sim \cN(0, \Sigma)$ where +$\Sigma= \sum_{i=1}^n (L\Var[f_i \mid L]L^\T +\Var[\varepsilon_i])$. +In the special case where $L$ is non-random +and $\cH_0$ is trivial, the coupling is Gaussian. Further, +if $f_i\mid L$ and $\varepsilon_i$ are symmetric about zero +and bounded, then $\pi_3=0$, and the coupling is improved. + +\item \emph{Martingales}. +Suppose instead that we assume only a martingale +condition on the latent factor variables so that +$\E \left[ f_i \mid L, f_1, \ldots, f_{i-1} \right] = 0$. 
+Then $\E [ X_i \mid \cH_{i-1} ] += L\, \E \left[ f_i \mid \cH_{i-1} \right] = 0$ +and Corollary~\ref{cor:yurinskii_sa_martingale} is applicable to +$\sum_{i=1}^n X_i$. +The preceding comments on Gaussian mixture distributions +and third-order couplings continue to apply. + +\item \emph{Mixingales}. +Finally, assume that the factors follow the +auto-regressive model $f_i = A f_{i-1} + u_i$ where +$A \in \R^{m \times m}$ is non-random and $(u_1, \ldots, u_n)$ are +zero-mean, independent, and independent of +$(\varepsilon_1, \ldots, \varepsilon_n)$. +Then $\E \left[ f_i \mid f_0 \right] = A^i f_0$, so taking +$p \in [1, \infty]$ we see that +$\E \big[ \| \E [ f_i \mid f_0 ] \|_p \big] += \E \big[ \| A^i f_0 \|_p \big] \leq \|A\|_p^i\,\E [ \|f_0\|_p ]$, +and that clearly $f_i - \E [ f_i \mid \cH_n ] = 0$. +Thus, whenever $\|A\|_p < 1$, the geometric sum formula implies that +we can apply the mixingale result from +Corollary~\ref{cor:yurinskii_sa_mixingale} to +$\sum_{i=1}^n X_i$. The conclusions on Gaussian mixture distributions +and third-order couplings parallel the previous cases. +% +\end{itemize} + +This simple application to factor modeling gives a preliminary illustration of +the power of our main results, encompassing settings which could not be handled +by employing Yurinskii couplings available in the existing literature. Even +with independent data, we offer new Yurinskii couplings to Gaussian mixture +distributions (due to the presence of the common random factor loading $L$), +which could be further improved whenever the factors and residuals possess +symmetric (conditional) distributions. Furthermore, our results do not impose +any restrictions on the minimum eigenvalue of $\Sigma$, thereby allowing for +more general factor structures. These improvements are maintained in the +martingale, mixingale, and weakly dependent stationary data settings. + +\section{Strong approximation for martingale empirical processes}% +\label{sec:yurinskii_emp_proc} + +In this section, we demonstrate how our main results can be applied to some more +substantive problems in statistics. Having until this point studied only +finite-dimensional (albeit potentially high-dimensional) random vectors, we now +turn our attention to infinite-dimensional stochastic processes. Specifically, +we consider empirical processes of the form +$S(f) = \sum_{i=1}^{n} f(X_i)$ for $f \in \cF$ +a problem-specific class of real-valued +functions, where each $f(X_i)$ forms a martingale difference sequence with +respect to an appropriate filtration. We construct (conditionally) Gaussian +processes $T(f)$ for which an upper bound on the uniform coupling error +$\sup_{f \in \cF} |S(f) - T(f)|$ is precisely quantified. We control the +complexity of $\cF$ using metric entropy under Orlicz norms. + +The novel strong approximation results which we present concern the entire +martingale empirical process $(S(f):f \in \cF)$, as opposed to just the scalar +supremum of the empirical process, $\sup_{f \in \cF} |S(f)|$. This distinction +has been carefully noted by \citet{chernozhukov2014gaussian}, who studied +Gaussian approximation of empirical process suprema in the independent data +setting and wrote (p.\ $1565$): ``A related but different problem is that of +approximating \textit{whole} empirical processes by a sequence of Gaussian +processes in the sup-norm. 
This problem is more difficult than +[approximating the supremum of the empirical process].'' +Indeed, the results we establish in +this section are for a strong approximation for the entire empirical process by +a sequence of Gaussian mixture processes in the supremum norm, when the data +has a martingale difference structure +(cf.\ Corollary \ref{cor:yurinskii_sa_martingale}). +Our results can be further generalized to approximate martingale +empirical processes (cf.\ Corollary \ref{cor:yurinskii_sa_mixingale}), but we +do not +consider this extension to reduce notation and the technical burden. + +\subsection{Motivating example: kernel density estimation} +\label{sec:yurinskii_kde} + +We begin with a brief study of a canonical example of an empirical process +which is non-Donsker (thus precluding the use of uniform central limit +theorems) due to the presence of a function class whose complexity increases +with the sample size: the kernel density estimator with i.i.d.\ scalar data. +We give an overview of our general strategy for +strong approximation of stochastic processes +via discretization, and show explicitly in +Lemma~\ref{lem:yurinskii_kde_eigenvalue} +how it is crucial +that we do not impose lower bounds on the eigenvalues of the discretized +covariance matrix. Detailed calculations for this section are +relegated to Appendix~\ref{app:yurinskii} for conciseness. + +Let $X_1, \ldots, X_n$ be i.i.d.\ $\Unif[0,1]$, take +$K(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$ the Gaussian kernel and let +$h \in (0,1]$ be a bandwidth. Then, for $a \in (0,1/4]$ and +$x \in \cX = [a, 1-a]$ to avoid boundary issues, the kernel density estimator +of the true density function $g(x) = 1$ is +% +\begin{align*} +\hat g(x) +&= +\frac{1}{n} +\sum_{i=1}^{n} +K_h( X_i - x), +\qquad K_h(u) = \frac{1}{h} K\left( \frac{u}{h} \right). +\end{align*} +% +Consider establishing a strong approximation for the stochastic process +$(\hat g(x)-\E [ \hat g(x) ] : x\in\cX)$ +which is, upon rescaling, non-Donsker whenever +the bandwidth decreases to zero in large samples. +To match notation with the upcoming +general result for empirical processes, set +$f_x(u) = \frac{1}{n} (K_h( u - x) - \E[K_h( X_i - x)])$ +so $S(x) \vcentcolon= S(f_x) = \hat g(x)-\E [ \hat g(x) ]$. +The next step is standard: a +mesh separates the local oscillations of the processes from +the finite-dimensional coupling. +For $\delta \in (0,1/2)$, set +$N = \left\lfloor 1 + \frac{1 - 2a}{\delta} \right\rfloor$ +and $\cX_\delta = (a + (j-1)\delta : 1 \leq j \leq N)$. +Letting $T(x)$ be the approximating stochastic +process to be constructed, consider the decomposition +% +\begin{align*} +\sup_{x \in \cX} +\big|S(x) - T(x)\big| +&\leq +\sup_{|x-x'| \leq \delta} +\big|S(x) - S(x') \big| ++ \max_{x \in \cX_\delta} +|S(x) - T(x)| ++ \sup_{|x-x'| \leq \delta} +\big|T(x) - T(x')\big|. +\end{align*} +% +Writing $S(\cX_\delta)$ for +$\big(S(x) : x \in \cX_\delta\big)\in \mathbb{R}^N$, +noting that this is a sum of i.i.d.\ random vectors, we apply +Corollary~\ref{cor:yurinskii_sa_indep} as +$\max_{x \in \cX_\delta} |S(x) - T(x)| += \| S(\cX_\delta) - T(\cX_\delta) \|_\infty$. +We obtain that for each $\eta > 0$ there is a Gaussian vector +$T(\cX_\delta)$ with the same covariance matrix as $S(\cX_\delta)$ satisfying +% +\begin{align*} +\P\left( +\|S(\cX_\delta) - T(\cX_\delta)\|_\infty > \eta +\right) +&\leq +31 \left( +\frac{N \log 2 N}{\eta^3 n^2 h^2} +\right)^{1/3} +\end{align*} +% +assuming that $1/h \geq \log 2 N$. 
+By the Vorob'ev--Berkes--Philipp theorem +\citep[Theorem~1.1.10]{dudley1999uniform}, +$T(\cX_\delta)$ extends to a Gaussian process $T(x)$ +defined for all $x \in \cX$ and with the same covariance structure +as $S(x)$. + +Next, chaining with the Bernstein--Orlicz and sub-Gaussian norms +\citep[Section~2.2]{van1996weak} shows that if +$\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$, +% +\begin{align*} +\sup_{|x-x'| \leq \delta} +\big\|S(x) - S(x') \big\|_\infty +&\lesssim_\P +\delta +\sqrt{\frac{\log n}{n h^3}} \ \quad\text{and}\quad +\sup_{|x-x'| \leq \delta} +\big\|T(x) - T(x')\big\|_\infty +\lesssim_\P +\delta +\sqrt{\frac{\log n}{n h^3}}. +\end{align*} +% +Finally, for any $R_n\to\infty$ +(see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), +the resulting bound on the coupling error is +% +\begin{align*} +\sup_{x \in \cX} +\big| S(x) - T(x) \big| +&\lesssim_\P +\left( \frac{N \log 2N}{n^2 h^2} \right)^{1/3} R_n ++ \delta \sqrt{\frac{\log n}{n h^3}}, +\end{align*} +% +where the mesh size $\delta$ can then be approximately +optimized to obtain the tightest possible strong approximation. + +The discretization strategy outlined above is at the core of the proof strategy +for our upcoming Proposition~\ref{pro:yurinskii_emp_proc}. Since we will +consider +martingale empirical processes, our proof will rely on +Corollary~\ref{cor:yurinskii_sa_martingale}, which, unlike the martingale +Yurinskii +coupling established by \citet{li2020uniform}, does not require a lower bound +on the minimum eigenvalue of $\Sigma$. Using the simple kernel density example +just discussed, we now demonstrate precisely the crucial importance of removing +such eigenvalue conditions. The following +Lemma~\ref{lem:yurinskii_kde_eigenvalue} shows +that the discretized covariance matrix $\Sigma = n h\Var[S(\cX_\delta)]$ has +exponentially small eigenvalues, which in turn will negatively affect the +strong approximation bound if the \citet{li2020uniform} coupling were to be +used instead of the results in this dissertation. + +\begin{lemma}[Minimum eigenvalue of a +kernel density estimator covariance matrix]% +\label{lem:yurinskii_kde_eigenvalue} +% +The minimum eigenvalue of +$\Sigma=n h\Var[S(\cX_\delta)] \in \R^{N \times N}$ +satisfies the upper bound +% +\begin{align*} +\lambda_{\min}(\Sigma) +&\leq +2 e^{-h^2/\delta^2} ++ \frac{h}{\pi a \delta} +e^{-a^2 / h^2}. +\end{align*} +\end{lemma} +% +Figure~\ref{fig:yurinskii_min_eig} shows how the upper bound in Lemma +\ref{lem:yurinskii_kde_eigenvalue} captures the behavior of the simulated +minimum +eigenvalue of $\Sigma$. In particular, the smallest eigenvalue decays +exponentially fast in the discretization level $\delta$ and the bandwidth $h$. +As seen in the calculations above, the coupling rate depends on $\delta / h$, +while the bias will generally depend on $h$, implying that both $\delta$ and +$h$ must converge to zero to ensure valid statistical inference. In general, +this will lead to $\Sigma$ possessing extremely small eigenvalues, rendering +strong approximation approaches such as that of \citet{li2020uniform} +ineffective in such scenarios. 
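+
+To illustrate how severe this degeneracy can be, consider evaluating the bound
+of Lemma~\ref{lem:yurinskii_kde_eigenvalue} with $a = 0.2$ and $h = 0.01$, as
+in Figure~\ref{fig:yurinskii_min_eig}: the second term is of order
+$e^{-a^2/h^2} = e^{-400}$ and hence negligible, while taking $\delta = h/2$
+gives an upper bound of roughly $2 e^{-4} \approx 0.04$ and taking
+$\delta = h/4$ gives roughly $2 e^{-16} \approx 2 \times 10^{-7}$. The minimum
+eigenvalue of $\Sigma$ thus collapses as soon as the mesh is moderately finer
+than the bandwidth.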
+%
+\begin{figure}[t]
+\centering
+\begin{subfigure}{0.49\textwidth}
+\centering
+\includegraphics[scale=0.64]{graphics/sim_2.pdf}
+\caption{$h = 0.03$}
+\end{subfigure}
+\begin{subfigure}{0.49\textwidth}
+\centering
+\includegraphics[scale=0.64]{graphics/sim_1.pdf}
+\caption{$h = 0.01$}
+\end{subfigure}
+\caption[Minimum eigenvalue of the kernel density covariance matrix]{
+Upper bounds on the minimum eigenvalue of the discretized covariance
+matrix in kernel density estimation,
+with $n=100$ and $a = 0.2$.
+Simulated: the kernel density estimator is simulated,
+resampling the data $100$ times
+to estimate its covariance.
+Computing matrix: the minimum eigenvalue of the limiting covariance
+matrix $\Sigma$ is computed explicitly.
+Upper bound: the bound derived in
+Lemma~\ref{lem:yurinskii_kde_eigenvalue}
+is shown.
+}
+\label{fig:yurinskii_min_eig}
+\end{figure}
+
+The discussion in this section focuses on the strong approximation of the
+centered process $\hat g(x)-\E [ \hat g(x) ]$. In practice, the goal is often
+rather to approximate the feasible process $\hat g(x)- g(x)$. The difference
+between these is captured by the smoothing bias $\E [ \hat g(x) ] - g(x)$,
+which is straightforward to control in this case with
+$\sup_{x \in \cX} \big| \E [ \hat g(x) ] - g(x) \big|
+\lesssim \frac{h}{a} e^{-a^2 / (2 h^2)}$.
+See Section~\ref{sec:yurinskii_nonparametric} for further
+comments.
+
+\subsection{General result for martingale empirical processes}
+
+We now give our general result on a strong approximation for
+martingale empirical processes, obtained by applying
+the first result \eqref{eq:yurinskii_sa_martingale_order_2} in
+Corollary~\ref{cor:yurinskii_sa_martingale} with $p=\infty$
+to a discretization of the empirical process,
+as in Section~\ref{sec:yurinskii_kde}.
+We then control the increments in the stochastic processes
+using chaining with Orlicz norms,
+but note that other tools are available,
+including generalized entropy with bracketing \citep{geer2000empirical}
+and sequential symmetrization \citep{rakhlin2015sequential}.
+
+A class of functions is said to be \emph{pointwise measurable}
+if it contains a countable subclass which is dense under
+the pointwise convergence topology.
+For a finite class $\cF$, write
+$\cF(x) = \big(f(x) : f \in \cF\big)$.
+Define the set of Orlicz functions
+%
+\begin{align*}
+\Psi
+&=
+\left\{
+\psi: [0, \infty) \to [0, \infty)
+\text{ convex increasing, }
+\psi(0) = 0,\ 
+\limsup_{x,y \to \infty} \tfrac{\psi(x) \psi(y)}{\psi(C x y)} < \infty
+\text{ for some } C > 0
+\right\}
+\end{align*}
+%
+and, for real-valued $Y$, the Orlicz norm
+$\vvvert Y \vvvert_\psi
+= \inf
+\left\{ C > 0:
+\E \left[ \psi(|Y|/C) \right] \leq 1
+\right\}$
+as in \citet[Section~2.2]{van1996weak}.
+
+\begin{proposition}[Strong approximation for martingale empirical processes]%
+\label{pro:yurinskii_emp_proc}
+
+Let $X_i$ be random variables for $1 \leq i \leq n$ taking values in a
+measurable space $\cX$, and $\cF$ be a pointwise measurable class of
+functions from $\cX$ to $\R$. Let $\cH_0, \ldots, \cH_n$ be a filtration such
+that each $X_i$ is $\cH_i$-measurable, with $\cH_0$ the trivial
+$\sigma$-algebra, and suppose that $\E[f(X_i) \mid \cH_{i-1}] = 0$ for all
+$f \in \cF$. Define $S(f) = \sum_{i=1}^n f(X_i)$ for $f\in\cF$ and let
+$\Sigma: \cF \times \cF \to \R$ be an almost surely positive semi-definite
+$\cH_0$-measurable random function.
Suppose that for a non-random +metric $d$ on $\cF$, constant $L$, and $\psi \in \Psi$, +% +\begin{align}% +\label{eq:yurinskii_emp_proc_var} +\Sigma(f,f) - 2\Sigma(f,f') + \Sigma(f',f') ++ \bigvvvert S(f) - S(f') \bigvvvert_\psi^2 +&\leq L^2 d(f,f')^2 \quad \text{a.s.} +\end{align} +% +Then for each $\eta > 0$ there is a process $T(f)$ +which, conditional on $\cH_0$, is zero-mean and Gaussian, +satisfying $\E\big[ T(f) T(f') \mid \cH_0 \big] = \Sigma(f,f')$ +for all $f, f' \in \cF$, and for all $t > 0$ has +% +\begin{align*} +&\P\left( +\sup_{f \in \cF} +\big| S(f) - T(f) \big| +\geq C_\psi(t + \eta) +\right) +\leq +C_\psi +\inf_{\delta > 0} +\inf_{\cF_\delta} +\Bigg\{ +\frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } \\ +&\qquad\quad+ +\left(\frac{\sqrt{\log 2 |\cF_\delta|} +\sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} ++ \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} ++ \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) +\Bigg\} +\end{align*} +% +where $\cF_\delta$ is any finite $\delta$-cover of $(\cF,d)$ +and $C_\psi$ is a constant depending only on $\psi$, with +% +\begin{align*} +\beta_\delta +&= \sum_{i=1}^n +\E\left[ \|\cF_\delta(X_i)\|^2_2\|\cF_\delta(X_i)\|_\infty ++ \|V_i(\cF_\delta)^{1/2}Z_i\|^2_2 +\|V_i(\cF_\delta)^{1/2}Z_i\|_\infty \right], \\ +V_i(\cF_\delta) +&= +\E\big[\cF_\delta(X_i) \cF_\delta(X_i)^\T \mid \cH_{i-1} \big], +\hspace*{27.7mm} +\Omega_\delta += +\sum_{i=1}^n V_i(\cF_\delta) - \Sigma(\cF_\delta), \\ +J_\psi(\delta) +&= +\int_0^\delta \psi^{-1}\big( N_\varepsilon \big) +\diff{\varepsilon} ++ \delta \psi^{-1} \big( N_\delta^2 \big), +\hspace*{19mm} +J_2(\delta) += \int_0^\delta \sqrt{\log N_\varepsilon} +\diff{\varepsilon}, +\end{align*} +% +where $N_\delta = N(\delta, \cF, d)$ +is the $\delta$-covering number of $(\cF, d)$ +and $Z_i$ are i.i.d.\ $\cN\big(0, I_{|\cF_\delta|}\big)$ +independent of $\cH_n$. +If $\cF_\delta$ is a minimal $\delta$-cover +of $(\cF, d)$, then $|\cF_\delta| = N_\delta$. +\end{proposition} + +Proposition~\ref{pro:yurinskii_emp_proc} +is given in a rather general form to accommodate a range of different +settings and applications. +In particular, consider the following well-known Orlicz functions. +% +\begin{description} + +\item[Polynomial:] +$\psi(x) = x^a$ for $a \geq 2$ +has $\vvvert X \vvvert_2 \leq \vvvert X \vvvert_\psi$ and +$\sqrt{\log x} \leq \sqrt{a} \psi^{-1}(x)$. + +\item[Exponential:] +$\psi(x) = \exp(x^a) - 1$ for $a \in [1,2]$ +has $\vvvert X \vvvert_2 \leq 2\vvvert X \vvvert_\psi$ and +$\sqrt{\log x} \leq \psi^{-1}(x)$. + +\item[Bernstein:] +$\psi(x) = \exp +\Big( +\Big(\frac{\sqrt{1+2ax}-1}{a}\Big)^{2} +\Big)-1$ +for $a > 0$ has +$\vvvert X \vvvert_2 \leq (1+a)\vvvert X \vvvert_\psi$ \\ and +$\sqrt{\log x}~\leq~\psi^{-1}(x)$. + +\end{description} +% +For these Orlicz functions and when $\Sigma(f, f') = \Cov[S(f), S(f')]$ is +non-random, the terms involving $\Sigma$ in \eqref{eq:yurinskii_emp_proc_var} +can be +controlled by the Orlicz $\psi$-norm term; similarly, $J_2$ is bounded by +$J_\psi$. Further, $C_\psi$ can be replaced by a universal constant $C$ which +does not depend on the parameter $a$. See Section~2.2 in \citet{van1996weak} +for details. 
If the conditional third moments of $f(X_i)$ given $\cH_{i-1}$ are +all zero (if $f$ and $X_i$ are appropriately symmetric, for example), then the +second inequality in Corollary~\ref{cor:yurinskii_sa_martingale} can be applied +to obtain +a tighter coupling inequality; the details of this are omitted for brevity, and +the proof would proceed in exactly the same manner. + +In general, however, Proposition~\ref{pro:yurinskii_emp_proc} allows for a +random +covariance function, yielding a coupling to a stochastic process that is +Gaussian only conditionally. Such a process can equivalently be viewed as a +mixture of Gaussian processes, writing $T=\Sigma^{1/2} Z$ with an operator +square root and where $Z$ is a Gaussian white noise on $\cF$ independent of +$\cH_0$. This extension is in contrast with much of the existing strong +approximation and empirical process literature, which tends to focus on +couplings and weak convergence results with marginally Gaussian processes +\citep{settati2009gaussian,chernozhukov2016empirical}. + +A similar approach was taken by \citet{berthet2006revisiting}, who used a +Gaussian coupling due to \citet{zaitsev1987estimates,zaitsev1987gaussian} along +with a discretization method to obtain strong approximations for empirical +processes with independent data. They handled fluctuations in the stochastic +processes with uniform $L^2$ covering numbers and bracketing numbers where we +opt instead for chaining with Orlicz norms. Our version using the martingale +Yurinskii coupling can improve upon theirs in approximation rate even for +independent data in certain circumstances. Suppose the setup of +Proposition~1 in \citet{berthet2006revisiting}; that is, $X_1, \ldots, X_n$ are +i.i.d.\ and $\sup_{\cF} \|f\|_\infty \leq M$, with the VC-type assumption +$\sup_\Q N(\varepsilon, \cF, d_\Q) \leq c_0 \varepsilon^{-\nu_0}$ where +$d_\Q(f,f')^2 = \E_\Q\big[(f-f')^2\big]$ for a measure $\Q$ on $\cX$ and +$M, c_0, \nu_0$ are constants. Using uniform $L^2$ covering numbers +rather than Orlicz chaining in our Proposition~4 gives the following. +Firstly, as $X_i$ are i.i.d., take $\Sigma(f, f') = \Cov[S(f), S(f')]$ so +$\Omega_\delta = 0$. Let $\cF_\delta$ be a minimal $\delta$-cover of +$(\cF, d_\P)$ with cardinality $N_\delta \lesssim \delta^{-\nu_0}$ where +$\delta \to 0$. It is easy to show that +$\beta_\delta \lesssim n \delta^{-\nu_0} \sqrt{\log(1/\delta)}$. +Theorem~2.2.8 and Theorem~2.14.1 in \citet{van1996weak} then give +% +\begin{align*} +\E\left[ +\sup_{d_\P(f,f') \leq \delta} +\Big( +|S(f) - S(f')| ++ |T(f) - T(f')| +\Big) +\right] +&\lesssim +\sup_\Q +\int_0^\delta +\sqrt{n \log N(\varepsilon, \cF, d_\Q)} +\diff{\varepsilon} \\ +&\lesssim +\delta \sqrt{n\log(1/\delta)}, +\end{align*} +% +where we used the VC-type property to bound the entropy integral. +So by our Proposition~\ref{pro:yurinskii_emp_proc}, +for any sequence $R_n \to \infty$ +(see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), +% +\begin{align*} +\sup_{f \in \cF} +\big| S(f) - T(f) \big| +&\lesssim_\P +n^{1/3} \delta^{-\nu_0/3} +\sqrt{\log(1/\delta)} R_n ++ \delta \sqrt{n\log(1/\delta)} +\lesssim_\P +n^{\frac{2+\nu_0}{6+2\nu_0}} +\sqrt{\log n} R_n, +\end{align*} +% +where we minimized over $\delta$ in the last step. 
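+Concretely, balancing $n^{1/3} \delta^{-\nu_0/3}$ against $\delta \sqrt{n}$
+(ignoring logarithmic factors and the slowly diverging $R_n$) suggests taking
+$\delta \asymp n^{-1/(6 + 2\nu_0)}$, which produces the rate displayed above.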
+\citet[Proposition~1]{berthet2006revisiting} achieved +% +\begin{align*} +\sup_{f \in \cF} +\big| S(f) - T(f) \big| +&\lesssim_\P +n^{\frac{5\nu_0}{4+10\nu_0}} +(\log n)^{\frac{4+5\nu_0}{4+10\nu_0}}, +\end{align*} +% +showing that our approach achieves a better approximation rate whenever +$\nu_0 > 4/3$. In particular, our method is superior in richer function classes +with larger VC-type dimension. For example, if $\cF$ is smoothly parameterized +by $\theta \in \Theta \subseteq \R^d$ where $\Theta$ contains an open set, then +$\nu_0 > 4/3$ corresponds to $d \geq 2$ and our rate is better as soon as the +parameter space is more than one-dimensional. The difference in approximation +rate is due to Zaitsev's coupling having better dependence on the sample size +but worse dependence on the dimension. In particular, Zaitsev's coupling is +stated only in $\ell^2$-norm and hence +\citet[Equation~5.3]{berthet2006revisiting} are compelled to use the inequality +$\|\cdot\|_\infty \leq \|\cdot\|_2$ in the coupling step, a bound which is +loose when the dimension of the vectors (here on the order of +$\delta^{-\nu_0}$) is even moderately large. We use the fact that our version +of Yurinskii's coupling applies directly to the supremum norm, giving sharper +dependence on the dimension. + +In Section~\ref{sec:yurinskii_local_poly} we apply +Proposition~\ref{pro:yurinskii_emp_proc} to +obtain strong approximations for local polynomial estimators in the +nonparametric regression setting. In contrast with the series estimators of the +upcoming Section~\ref{sec:yurinskii_series}, local polynomial estimators are +not linearly +separable and hence cannot be analyzed directly using the finite-dimensional +Corollary~\ref{cor:yurinskii_sa_martingale}. + +\section{Applications to nonparametric regression} +\label{sec:yurinskii_nonparametric} + +We illustrate the applicability of our previous strong approximation results +with two substantial and classical examples in nonparametric regression +estimation. Firstly, we present an analysis of partitioning-based series +estimators, where we can apply Corollary~\ref{cor:yurinskii_sa_martingale} +directly due to an intrinsic linear separability property. Secondly, we +consider local polynomial estimators, this time using +Proposition~\ref{pro:yurinskii_emp_proc} due to a non-linearly separable +martingale empirical process. + +\subsection{Partitioning-based series estimators} +\label{sec:yurinskii_series} + +Partitioning-based least squares methods are essential tools for estimation and +inference in nonparametric regression, encompassing splines, piecewise +polynomials, compactly supported wavelets and decision trees as special cases. +See \citet{cattaneo2020large} for further details and references throughout +this section. We illustrate the usefulness of +Corollary~\ref{cor:yurinskii_sa_martingale} +by deriving a Gaussian strong approximation for partitioning series estimators +based on multivariate martingale data. Proposition~\ref{pro:yurinskii_series} +shows how +we achieve the best known rate of strong approximation for independent data by +imposing an additional mild $\alpha$-mixing condition to control the time +series dependence of the regressors. 
+ +Consider the nonparametric regression setup with martingale difference +residuals defined by $Y_i = \mu(W_i) + \varepsilon_i$ for $ 1 \leq i \leq n$ +where the regressors $W_i$ have compact connected support $\cW \subseteq \R^m$, +$\cH_i$ is the $\sigma$-algebra generated by +$(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, +$\E[\varepsilon_i \mid \cH_{i-1}] = 0$ and $\mu: \cW \to \R$ is the estimand. +Let $p(w)$ be a $k$-dimensional vector of bounded basis functions on $\cW$ +which are locally supported on a quasi-uniform partition +\citep[Assumption~2]{cattaneo2020large}. Under minimal regularity conditions, +the least-squares partitioning-based series estimator is +$\hat\mu(w) = p(w)^{\T} \hat H^{-1} \sum_{i=1}^n p(W_i) Y_i$ +with $\hat H = \sum_{i=1}^n p(W_i) p(W_i)^\T$. +The approximation power of the estimator $\hat\mu(w)$ derives from letting +$k\to\infty$ as $n\to\infty$. The assumptions made on $p(w)$ are mild enough to +accommodate splines, wavelets, piecewise polynomials, and certain types of +decision trees. For such a tree, $p(w)$ is comprised of indicator functions +over $k$ axis-aligned rectangles forming a partition of $\cW$ (a Haar basis), +provided that the partitions are constructed using independent data +(e.g., with sample splitting). + +Our goal is to approximate the law of the stochastic process +$(\hat\mu(w)-\mu(w):w\in\cW)$, which upon rescaling is typically not +asymptotically tight as $k \to \infty$ and thus does not converge weakly. +Nevertheless, exploiting the intrinsic linearity of the estimator $\hat\mu(w)$, +we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly to construct +a Gaussian +strong approximation. Specifically, we write +% +\begin{equation*} +\hat\mu(w) - \mu(w) += p(w)^\T H^{-1} S ++ p(w)^\T \big(\hat H^{-1} - H^{-1}\big) S ++ \Bias(w), +\end{equation*} +% +where $H= \sum_{i=1}^n \E\left[p(W_i) p(W_i)^\T\right]$ +is the expected outer product matrix, $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ +is the score vector, and +$\Bias(w) = p(w)^{\T} \hat H^{-1}\sum_{i=1}^n p(W_i) \mu(W_i) - \mu(w)$. +Imposing some mild time series restrictions and assuming stationarity, +it is not difficult to show +(see Section~\ref{sec:yurinskii_app_proofs}) +that $\|\hat H - H\|_1 \lesssim_\P \sqrt{n k}$ and +$\sup_{w\in\cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$ +for some $\gamma>0$, depending on the specific structure of the basis +functions, the dimension $m$ of the regressors, and the smoothness of the +regression function $\mu$. It remains to study the $k$-dimensional +mean-zero martingale $S$ by applying +Corollary~\ref{cor:yurinskii_sa_martingale} with +$X_i=p(W_i) \varepsilon_i$. Controlling the convergence of the quadratic +variation term $\E[\|\Omega\|_2]$ requires some time series dependence +assumptions; we impose an $\alpha$-mixing condition on $(W_1, \ldots, W_n)$ for +illustration \citep{bradley2005basic}. + +\begin{proposition}[Strong approximation for partitioning series estimators]% +\label{pro:yurinskii_series} +% +Consider the nonparametric regression setup described above +and further assume the following: +% +\begin{enumerate}[label=(\roman*)] + +\item +$(W_i, \varepsilon_i)_{1 \leq i \leq n}$ +is strictly stationary. + +\item +$W_1, \ldots, W_n$ is $\alpha$-mixing with mixing coefficients +satisfying $\sum_{j=1}^\infty \alpha(j) < \infty$. + +\item +$W_i$ has a Lebesgue density on $\cW$ +which is bounded above and away from zero. 
+ +\item +$\E\big[|\varepsilon_i|^3 \big] < \infty$ +and +$\E\big[\varepsilon_i^2 \mid \cH_{i-1}\big]=\sigma^2(W_i)$ +is bounded away from zero. + +\item +$p(w)$ is a basis with $k$ features satisfying +Assumptions~2 and~3 in \citet{cattaneo2020large}. + +\end{enumerate} +% +Then, for any sequence $R_n \to \infty$, +there is a zero-mean Gaussian process +$G(w)$ indexed on $\cW$ +with $\Var[G(w)] \asymp\frac{k}{n}$ +satisfying +$\Cov[G(w), G(w')] += \Cov[p(w)^\T H^{-1} S,\, p(w')^\T H^{-1} S]$ +and +% +\begin{align*} +\sup_{w \in \cW} +\left| \hat\mu(w) - \mu(w) - G(w) \right| +&\lesssim_\P +\sqrt{\frac{k}{n}} +\left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} R_n ++ \sup_{w \in \cW} |\Bias(w)| +\end{align*} +% +assuming the number of basis functions satisfies $k^3 / n \to 0$. +If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then +% +\begin{align*} +\sup_{w \in \cW} +\left| \hat\mu(w) - \mu(w) - G(w) \right| +&\lesssim_\P +\sqrt{\frac{k}{n}} +\left( \frac{k^3 (\log k)^2}{n} \right)^{1/4} R_n ++ \sup_{w \in \cW} |\Bias(w)|. +\end{align*} +% +\end{proposition} + +The core concept in the proof of Proposition~\ref{pro:yurinskii_series} is to +apply +Corollary~\ref{cor:yurinskii_sa_martingale} with +$S = \sum_{i=1}^n p(W_i) \varepsilon_i$ +and $p=\infty$ to construct $T \sim \cN\big(0, \Var[S]\big)$ such that +$\|S - T \|_\infty$ is small, and then setting $G(w) = p(w)^\T H^{-1} T$. So +long as the bias can be appropriately controlled, this result allows for +uniform inference procedures such as uniform confidence bands or shape +specification testing. The condition $k^3 / n \to 0$ is the same (up to logs) +as that imposed by \citet{cattaneo2020large} for i.i.d. data, which gives the +best known strong approximation rate for this problem. Thus, +Proposition~\ref{pro:yurinskii_series} gives the same best approximation rate +without +requiring any extra restrictions for $\alpha$-mixing time series data. + +Our results improve substantially on \citet[Theorem~1]{li2020uniform}: using +the notation of our Corollary~\ref{cor:yurinskii_sa_martingale}, and with any +sequence +$R_n \to \infty$, a valid (see +Remark~\ref{rem:yurinskii_coupling_bounds_probability}) +version of their martingale Yurinskii coupling is +% +\begin{align*} +\|S-T\|_2 +\lesssim_\P +d^{1/2} r^{1/2}_n ++ (B_n d)^{1/3} R_n, +\end{align*} +% +where $B_n = \sum_{i=1}^n \E[\|X_i\|_2^3]$ and $r_n$ is a term controlling the +convergence of the quadratic variation, playing a similar role to our +term $\E[\|\Omega\|_2]$. Under the assumptions of our +Proposition~\ref{pro:yurinskii_series}, applying this +result with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ yields a rate no better +than $\|S-T\|_2 \lesssim_\P (n k)^{1/3} R_n$. As such, they attain a rate of +strong approximation no faster than +% +\begin{align*} +\sup_{w \in \cW} +\left| \hat\mu(w) - \mu(w) - G(w) \right| +&\lesssim_\P +\sqrt{\frac{k}{n}} +\left( \frac{k^5}{n} \right)^{1/6} R_n ++ \sup_{w \in \cW} |\Bias(w)|. +\end{align*} +% +Hence, for this approach to yield a valid strong approximation, the number of +basis functions must satisfy $k^5/n \to 0$, a more restrictive assumption than +our $k^3 / n \to 0$ (up to logs). This difference is due to +\citet{li2020uniform} using the $\ell^2$-norm version of Yurinskii's coupling +rather than the recently established $\ell^\infty$ version. Further, +our approach allows for an improved rate of distributional approximation +whenever the residuals have zero conditional third moment. 
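+
+As a back-of-the-envelope comparison, and ignoring logarithmic terms and the
+slowly diverging sequence $R_n$ throughout, suppose the number of basis
+functions is chosen as $k \asymp n^{1/4}$. Then $k^3 / n \to 0$ while
+$k^5 / n \to \infty$, so the rate condition above is satisfied but the more
+restrictive one is not. Moreover, the coupling term in
+Proposition~\ref{pro:yurinskii_series} is of order
+%
+\begin{align*}
+\sqrt{\frac{k}{n}}
+\left( \frac{k^3}{n} \right)^{1/6}
+&\asymp
+n^{-3/8} \, n^{-1/24}
+=
+n^{-5/12},
+\end{align*}
+%
+which is negligible relative to the scale
+$\sqrt{\Var[G(w)]} \asymp \sqrt{k/n} \asymp n^{-3/8}$
+of the approximating Gaussian process, whereas the corresponding term
+$\sqrt{k/n} \, (k^5/n)^{1/6} \asymp n^{-1/3}$ obtained from the
+$\ell^2$-norm coupling is not.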
+ +To illustrate the statistical applicability of +Proposition~\ref{pro:yurinskii_series}, consider constructing a feasible uniform +confidence band for the regression function $\mu$, using standardization and +Studentization for statistical power improvements. We assume throughout that +the bias is negligible. Proposition~\ref{pro:yurinskii_series} and +anti-concentration for +Gaussian suprema \citep[Corollary~2.1]{chernozhukov2014anti} yield +a distributional approximation for the supremum statistic whenever +$k^3(\log n)^6 / n \to 0$, giving +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} +\right| \leq t +\right) +- +\P\left( +\sup_{w \in \cW} +\left| +\frac{G(w)}{\sqrt{\rho(w,w)}} +\right| \leq t +\right) +\right| +&\to 0, +\end{align*} +% +where $\rho(w,w') = \E[G(w)G(w')]$. Further, by a Gaussian--Gaussian +comparison result \citep[Lemma~3.1]{chernozhukov2013gaussian} and +anti-concentration, we show (see the proof of +Proposition~\ref{pro:yurinskii_series}) that with $\bW = (W_1, \ldots, W_n)$ and +$\bY = (Y_1, \ldots, Y_n)$, +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat\mu(w)-\mu(w)}{\sqrt{\hat\rho(w,w)}} +\right| \leq t +\right) +- \P\left( +\sup_{w \in \cW} +\left| +\frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} +\right| \leq t \biggm| \bW, \bY +\right) +\right| +&\to_\P 0, +\end{align*} +% +where $\hat G(w)$ is a zero-mean Gaussian process +conditional on $\bW$ and $\bY$ with conditional covariance function +$\hat\rho(w,w') +=\E\big[\hat G(w) \hat G(w') \mid \bW, \bY \big] += p(w)^\T \hat H^{-1} \hat V \hat H^{-1}p(w')$ +for some estimator $\hat V$ satisfying +$\frac{k (\log n)^2}{n} +\big\|\hat V-\Var[S]\big\|_2 \to_\P 0$. +For example, one could use the plug-in estimator +$\hat V=\sum_{i=1}^n p(W_i) p(W_i)^\T \hat{\sigma}^2(W_i)$ +where $\hat{\sigma}^2(w)$ satisfies +$(\log n)^2 \sup_{w \in \cW} +|\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$. +This leads to the following feasible and asymptotically valid +$100(1-\tau)\%$ +uniform confidence band for partitioning-based series estimators +based on martingale data. + +\begin{proposition}[Feasible uniform confidence bands for partitioning +series estimators]% +\label{pro:yurinskii_series_feasible} +% +Assume the setup of the preceding section. Then +% +\begin{align*} +\P\Big( +\mu(w) \in +\Big[ +\hat\mu(w) \pm \hat q(\tau) +\sqrt{\hat\rho(w,w)} +\Big] +\ \text{for all } +w \in \cW \Big) +\to 1-\tau, +\end{align*} +% +where +% +\begin{align*} +\hat{q}(\tau) +&= +\inf +\left\{ +t \in \R: +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} +\right| +\leq t +\Bigm| \bW, \bY +\right) +\geq \tau +\right\} +\end{align*} +% +is the conditional quantile of the supremum of the Studentized Gaussian +process. This can be estimated by resampling the conditional law of +$\hat G(w) \mid \bW, \bY$ with a discretization of $w \in \cW$. +\end{proposition} + +\subsection{Local polynomial estimators} +\label{sec:yurinskii_local_poly} + +As a second example application we consider nonparametric regression estimation +with martingale data employing local polynomial methods +\citep{fan1996local}. In contrast with the partitioning-based series +methods of Section~\ref{sec:yurinskii_series}, local polynomials induce +stochastic +processes which are not linearly separable, allowing us to showcase the +empirical process result given in Proposition \ref{pro:yurinskii_emp_proc}. 
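+
+As a preview of the construction below, note that in the simplest case of
+polynomial order zero the local polynomial estimator reduces to the
+kernel-weighted local average (Nadaraya--Watson) estimator
+%
+\begin{align*}
+\hat{\mu}(w)
+&=
+\frac{\sum_{i=1}^n K_h(W_i - w) \, Y_i}
+{\sum_{i=1}^n K_h(W_i - w)},
+\end{align*}
+%
+where $K_h(w) = h^{-m} K(w/h)$ as defined below, whenever the denominator is
+nonzero. In particular, both the numerator and the denominator vary with the
+evaluation point $w$, in contrast with the single score vector $S$ appearing
+in the partitioning-based construction of the previous subsection.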
+ +As before, suppose that +$Y_i = \mu(W_i) + \varepsilon_i$ +for $ 1 \leq i \leq n$ +where $W_i$ has compact connected support $\cW \subseteq \R^m$, +$\cH_i$ is the $\sigma$-algebra generated by +$(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, +$\E[\varepsilon_i \mid \cH_{i-1}] = 0$, +and $\mu: \cW \to \R$ is the estimand. Let $K$ be a kernel function on $\R^m$ +and $K_h(w) = h^{-m} K(w/h)$ for some bandwidth $h > 0$. +Take $\gamma \geq 0$ a fixed polynomial order and let +$k = (m+\gamma)!/(m!\gamma!)$ be the number of monomials up to order $\gamma$. +Using multi-index notation, +let $p(w)$ be the $k$-dimensional vector +collecting the monomials $w^{\kappa}/\kappa!$ +for $0 \leq |\kappa| \leq \gamma$, +and set $p_h(w) = p(w/h)$. +The local polynomial regression estimator of $\mu(w)$ is, +with $e_1 = (1, 0, \ldots, 0)^\T \in \R^k$ the first standard unit vector, +% +\begin{align*} +\hat{\mu}(w) +&= +e_1^\T\hat{\beta}(w) +&\text{where} & +&\hat{\beta}(w) +&= +\argmin_{\beta \in \R^{k}} +\sum_{i=1}^n +\left(Y_i - p_h(W_i-w)^\T \beta \right)^2 +K_h(W_i-w). +\end{align*} + +Our goal is again to approximate the distribution of the entire stochastic +process, $(\hat{\mu}(w)-\mu(w):w\in\cW)$, which upon rescaling is non-Donsker +if $h \to 0$, and decomposes as follows: +% +\begin{align*} +\hat{\mu}(w)-\mu(w) +&= e_1^\T H(w)^{-1} S(w) ++ e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) ++ \Bias(w) +\end{align*} +% +where +$\hat H(w) = \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T$, +$H(w) = \E \big[ \hat H(w) \big]$, +$S(w)= \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i$, +and +$\Bias(w) = e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w)$. +A key distinctive feature of local polynomial regression is that both +$\hat H(w)$ and $S(w)$ are functions of the evaluation point $w\in\cW$; +contrast this with the partitioning-based series estimator discussed in +Section~\ref{sec:yurinskii_series}, for which neither $\hat H$ nor $S$ depend +on $w$. +Therefore we use Proposition \ref{pro:yurinskii_emp_proc} to obtain a Gaussian +strong +approximation for the martingale empirical process directly. + +Under mild regularity conditions, including stationarity for simplicity +and an $\alpha$-mixing assumption on the time-dependence of the data, we show +$\sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 +\lesssim_\P \sqrt{n h^{-2m}\log n}$. +Further, +$\sup_{w\in\cW} |\Bias(w)| +\lesssim_\P h^\gamma$ +provided that the regression function is sufficiently smooth. +It remains to analyze the martingale empirical process given by +$\big(e_1^\T H(w)^{-1} S(w) : w\in\cW\big)$ +via Proposition \ref{pro:yurinskii_emp_proc} by setting +% +\begin{align*} +\cF = \left\{ +(W_i, \varepsilon_i) \mapsto +e_1^\T H(w)^{-1} +K_h(W_i-w) p_h(W_i-w) \varepsilon_i +: w \in \cW +\right\}. +\end{align*} +% +With this approach, we obtain the following result. + +\begin{proposition}[Strong approximation for local polynomial estimators]% +\label{pro:yurinskii_local_poly} + +Under the nonparametric regression setup described above, +assume further that +% +\begin{enumerate}[label=(\roman*)] + +\item +$(W_i, \varepsilon_i)_{1 \leq i \leq n}$ +is strictly stationary. + +\item +$(W_i, \varepsilon_i)_{1 \leq i \leq n}$ +is $\alpha$-mixing with mixing coefficients +$\alpha(j) \leq e^{-2 j / C_\alpha}$ +for some $C_\alpha > 0$. + +\item +$W_i$ has a Lebesgue density on $\cW$ +which is bounded above and away from zero. 
+ +\item +$\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ +for $C_\varepsilon > 0$ and +$\E\left[\varepsilon^2_i \mid \cH_{i-1}\right]=\sigma^2(W_i)$ +is bounded away from zero. + +\item +$K$ is a non-negative Lipschitz +compactly supported kernel with +$\int K(w) \diff{w} = 1$. + +\end{enumerate} +% +Then for any $R_n \to \infty$, +there is a zero-mean Gaussian process +$T(w)$ on $\cW$ +with $\Var[T(w)] \asymp\frac{1}{n h^m}$ +satisfying +$\Cov[T(w), T(w')] += \Cov[e_1^\T H(w)^{-1} S(w),\, e_1^\T H(w')^{-1} S(w')]$ +and +% +\begin{align*} +\sup_{w \in \cW} +\left|\hat \mu(w) - \mu(w) - T(w) \right| +&\lesssim_\P +\frac{R_n}{\sqrt{n h^m}} +\left( +\frac{(\log n)^{m+4}}{n h^{3m}} +\right)^{\frac{1}{2m+6}} ++ \sup_{w \in \cW} |\Bias(w)|, +\end{align*} +% +provided that the bandwidth sequence satisfies +$n h^{3m} \to \infty$. +% +\end{proposition} + +If the residuals further satisfy +$\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$, then +a third-order Yurinskii coupling delivers an improved rate of strong +approximation for Proposition~\ref{pro:yurinskii_local_poly}; this is omitted +here for +brevity. For completeness, the proof of +Proposition~\ref{pro:yurinskii_local_poly} +verifies that if the regression function $\mu(w)$ is $\gamma$ times +continuously differentiable on $\cW$ then +$\sup_w |\Bias(w)| \lesssim_\P h^\gamma$. Further, the assumption that $p(w)$ +is a vector of monomials is unnecessary in general; any collection of bounded +linearly independent functions which exhibit appropriate approximation power +will suffice \citep{eggermont2009maximum}. As such, we can encompass local +splines and wavelets, as well as polynomials, and also choose whether or not to +include interactions between the regressor variables. The bandwidth restriction +of $n h^{3m} \to \infty$ is analogous to that imposed in +Proposition~\ref{pro:yurinskii_series} for partitioning-based series +estimators, and as +far as we know, has not been improved upon for non-i.i.d.\ data. + +Applying an anti-concentration result for Gaussian process suprema, such as +Corollary~2.1 in \citet{chernozhukov2014anti}, allows one to write a +Kolmogorov--Smirnov bound comparing the law of +$\sup_{w \in \cW}|\hat\mu(w) - \mu(w)|$ to that of $\sup_{w \in \cW}|T(w)|$. +With an appropriate covariance estimator, we can further replace $T(w)$ by a +feasible version $\hat T(w)$ or its Studentized counterpart, enabling +procedures for uniform inference analogous to the confidence bands constructed +in Section~\ref{sec:yurinskii_series}. We omit the details of this to conserve +space but +note that our assumptions on $W_i$ and $\varepsilon_i$ ensure that +Studentization is possible even when the discretized covariance matrix has +small eigenvalues (Section~\ref{sec:yurinskii_kde}), as we normalize only by +the diagonal +entries. \citet[Remark~3.1]{chernozhukov2014gaussian} achieve better rates for +approximating the supremum of the $t$-process based on i.i.d.\ data in +Kolmogorov--Smirnov distance by bypassing the step where we first approximate +the entire stochastic process (see Section~\ref{sec:yurinskii_emp_proc} for a +discussion). +Nonetheless, our approach targeting the entire process allows for a +potential future +treatment of other functionals as well as the supremum. 
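+
+To get a rough sense of the magnitudes involved, consider scalar regressors
+($m = 1$), polynomial order $\gamma = 2$ with a twice continuously
+differentiable regression function, and the undersmoothing bandwidth choice
+$h \asymp n^{-1/4}$, again ignoring logarithmic terms and the sequence $R_n$.
+Then $n h^{3} \asymp n^{1/4} \to \infty$ as required, and the coupling term in
+Proposition~\ref{pro:yurinskii_local_poly} is of order
+%
+\begin{align*}
+\frac{1}{\sqrt{n h}}
+\left( \frac{1}{n h^{3}} \right)^{1/8}
+&\asymp
+n^{-3/8} \, n^{-1/32}
+=
+n^{-13/32},
+\end{align*}
+%
+which is negligible relative to the scale
+$\sqrt{\Var[T(w)]} \asymp (n h)^{-1/2} \asymp n^{-3/8}$
+of the approximating Gaussian process, while the bias term is at most of
+order $h^{2} = n^{-1/2}$ and hence also negligible.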
+ +We finally remark that in this setting of kernel-based local empirical +processes, it is essential that our initial strong approximation result +(Corollary~\ref{cor:yurinskii_sa_martingale}) does not impose a lower bound on +the +eigenvalues of the variance matrix $\Sigma$. This effect was demonstrated by +Lemma \ref{lem:yurinskii_kde_eigenvalue}, +Figure~\ref{fig:yurinskii_min_eig}, and their surrounding discussion in +Section~\ref{sec:yurinskii_kde}. As such, the result of \citet{li2020uniform} is +unsuited for this application, even in its simplest formulation, +due to the strong minimum eigenvalue assumption. + +\section{Conclusion} +\label{sec:yurinskii_conclusion} + +In this chapter we introduced as our main result a new version of Yurinskii's +coupling which strictly generalizes all previously known forms of the result. +Our formulation gave a Gaussian mixture coupling for approximate martingale +vectors in $\ell^p$-norm where $1 \leq p \leq \infty$, with no restrictions on +the minimum eigenvalues of the associated covariance matrices. We further +showed how to obtain an improved approximation whenever third moments of the +data are negligible. We demonstrated the applicability of this main result by +first deriving a user-friendly version, and then specializing it to mixingales, +martingales, and independent data, illustrating the benefits with a collection +of simple factor models. We then considered the problem of constructing uniform +strong approximations for martingale empirical processes, demonstrating how our +new Yurinskii coupling can be employed in a stochastic process setting. As +substantive illustrative applications of our theory to some +well-established problems in statistical methodology, we showed how to use our +coupling results for both vector-valued and empirical process-valued +martingales in developing uniform inference procedures for partitioning-based +series estimators and local polynomial models in nonparametric regression. At +each stage we addressed issues of feasibility, compared our work with the +existing literature, and provided implementable statistical inference +procedures. The work in this chapter is based on \citet{cattaneo2022yurinskii}. + +\appendix + + +\chapter{Supplement to Inference with Mondrian Random Forests} +\label{app:mondrian} + +In this section we present the full proofs of all our results, +and also state some useful technical preliminary and +intermediate lemmas, along with some further properties +of the Mondrian process not required for our primary analysis. +See Section~\ref{sec:mondrian_overview_proofs} in the main text +for an overview of the main proof strategies and a discussion of +the challenges involved. +We use the following simplified notation for convenience, +whenever it is appropriate. +We write $\I_{i b}(x) = \I \left\{ X_i \in T_b(x) \right\}$ +and $N_b(x) = \sum_{i=1}^{n} \I_{i b}(x)$, +as well as $\I_b(x) = \I \left\{ N_b(x) \geq 1 \right\}$. + +\section{Preliminary lemmas} + +We begin by bounding the maximum size of any cell +in a Mondrian forest containing $x$. +This result is used regularly throughout many of our other proofs, +and captures the ``localizing'' behavior of the Mondrian random +forest estimator, showing that Mondrian cells have side lengths +at most on the order of $1/\lambda$. + +\begin{lemma}[Upper bound on the largest cell in a Mondrian forest]% +\label{lem:mondrian_app_largest_cell} +% +Let $T_1, \ldots, T_b \sim \cM\big([0,1]^d, \lambda\big)$ +and take $x \in (0,1)^d$. 
Then for all $t > 0$ +% +\begin{align*} +\P \left( +\max_{1 \leq b \leq B} +\max_{1 \leq j \leq d} +|T_b(x)_j| +\geq \frac{t}{\lambda} +\right) +&\leq +2dB e^{-t/2}. +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell}] +% +We use the distribution of the Mondrian cell shape +\citep[Proposition~1]{mourtada2020minimax}. We have +$|T_b(x)_j| = \left( \frac{E_{bj1}}{\lambda} \wedge x_j \right) ++ \left( \frac{E_{bj2}}{\lambda} \wedge (1-x_j) \right)$ +where $E_{bj1}$ and $E_{bj2}$ +are i.i.d.\ $\Exp(1)$ variables for +$1 \leq b \leq B$ and $1 \leq j \leq d$. +Thus $|T_b(x)_j| \leq \frac{E_{bj1} + E_{bj2}}{\lambda}$ +so by a union bound +% +\begin{align*} +\P \left( +\max_{1 \leq b \leq B} +\max_{1 \leq j \leq d} +|T_b(x)_j| +\geq \frac{t}{\lambda} +\right) +&\leq +\P \left( +\max_{1 \leq b \leq B} +\max_{1 \leq j \leq d} +(E_{bj1} \vee E_{bj2}) +\geq \frac{t}{2} +\right) \\ +&\leq +2dB\, +\P \left( +E_{bj1} +\geq \frac{t}{2} +\right) +\leq +2dB e^{-t/2}. +\end{align*} +% +\end{proof} + +Next is another localization result, +showing that the union +of the cells $T_b(x)$ containing $x$ does not contain ``too many'' +samples $X_i$. +Thus the Mondrian random forest estimator fitted at $x$ +only depends on $n/\lambda^d$ (the effective sample size) +data points up to logarithmic terms. + +\begin{lemma}[Upper bound on the number of active data points]% +\label{lem:mondrian_app_active_data} +Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} +hold, +and define +$N_{\cup}(x) = +\sum_{i=1}^{n} \I \left\{ X_i \in \bigcup_{b=1}^{B} T_b(x) \right\}$. +Then for $t > 0$ and sufficiently large $n$, +with $\|f\|_\infty = \sup_{x \in [0,1]^d} f(x)$, +% +\begin{align*} +\P \left( N_{\cup}(x) > t^{d+1} +\frac{n}{\lambda^d} +\|f\|_\infty +\right) +&\leq +4 d B e^{-t/4}. +\end{align*} +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_active_data}] + +Note +$N_\cup(x) \sim +\Bin\left(n, \int_{\bigcup_{b=1}^{B} T_b(x)} f(s) \diff s \right) +\leq \Bin\left(n, 2^d \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} +|T_b(x)_j|^d \|f\|_\infty \right)$ +conditionally on $\bT$. +If $N \sim \Bin(n,p)$ then, by Bernstein's inequality, +$\P\left( N \geq (1 + t) n p\right) +\leq \exp\left(-\frac{t^2 n^2 p^2 / 2}{n p(1-p) + t n p / 3}\right) +\leq \exp\left(-\frac{3t^2 n p}{6 + 2t}\right)$. +Thus for $t \geq 2$, +% +\begin{align*} +\P \left( N_{\cup}(x) > (1+t) n \frac{2^d t^d}{\lambda^d} +\|f\|_\infty +\Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} +|T_j(x)| \leq \frac{t}{\lambda} +\right) +&\leq +\exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right). +\end{align*} +% +By Lemma~\ref{lem:mondrian_app_largest_cell}, +$\P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} +|T_j(x)| > \frac{t}{\lambda} \right) +\leq 2 d B e^{-t/2}$. +Hence +% +\begin{align*} +&\P \left( N_{\cup}(x) > 2^{d+1} t^{d+1} \frac{n}{\lambda^d} +\|f\|_\infty +\right) \\ +&\quad\leq +\P \left( N_{\cup}(x) > 2 t n \frac{2^d t^d}{\lambda^d} +\|f\|_\infty +\Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} +|T_j(x)| \leq \frac{t}{\lambda} +\right) ++ \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} +|T_j(x)| > \frac{t}{\lambda} +\right) \\ +&\quad\leq +\exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right) ++ 2 d B e^{-t/2}. +\end{align*} +% +Replacing $t$ by $t/2$ gives that for sufficiently large $n$ such that +$n / \lambda^d \geq 1$, +% +\begin{align*} +\P \left( N_{\cup}(x) > t^{d+1} +\frac{n}{\lambda^d} +\|f\|_\infty +\right) +&\leq +4 d B e^{-t/4}. 
+\end{align*} +% +\end{proof} + +Next we give a series of results culminating in a +generalized moment bound for the denominator appearing +in the Mondrian random forest estimator. +We begin by providing a moment bound for the truncated inverse binomial +distribution, which will be useful for controlling +$\frac{\I_b(x)}{N_b(x)} \leq 1 \wedge \frac{1}{N_b(x)}$ +because conditional on $T_b$ we have +$N_b(x) \sim \Bin \left( n, \int_{T_b(x)} f(s) \diff s \right)$. +Our constants could be significantly suboptimal but they are sufficient +for our applications. + +\begin{lemma}[An inverse moment bound for the binomial distribution]% +\label{lem:mondrian_app_binomial_bound} +For $n \geq 1$ and $p \in [0,1]$, +let $N \sim \Bin(n, p)$ and $a_1, \ldots, a_k \geq 0$. +Then +% +\begin{align*} +\E\left[ +\prod_{j=1}^k +\left( +1 \wedge +\frac{1}{N + a_j} +\right) +\right] +&\leq +(9k)^k +\prod_{j=1}^k +\left( +1 \wedge +\frac{1}{n p + a_j} +\right). +\end{align*} +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_bound}] +By Bernstein's inequality, +$\P\left( N \leq n p - t \right) +\leq \exp\left(-\frac{t^2/2}{n p(1-p) + t/3}\right) +\leq \exp\left(-\frac{3t^2}{6n p + 2t}\right)$. +Therefore we have +$\P\left( N \leq n p/4 \right) +\leq \exp\left(-\frac{27 n^2 p^2 / 16}{6n p + 3 n p / 2}\right) += e^{-9 n p / 40}$. +Partitioning by this event gives +% +\begin{align*} +\E\left[ +\prod_{j=1}^k +\left( +1 \wedge +\frac{1}{N + a_j} +\right) +\right] +&\leq +e^{-9 n p / 40} +\prod_{j=1}^k +\frac{1}{1 \vee a_j} ++ \prod_{j=1}^k +\frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ +&\leq +\prod_{j=1}^k +\frac{1}{\frac{9 n p}{40k} + (1 \vee a_j)} ++ \prod_{j=1}^k +\frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ +&\leq +\prod_{j=1}^k +\frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} ++ \prod_{j=1}^k +\frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ +&\leq +2 \prod_{j=1}^k +\frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} +\leq +2 \prod_{j=1}^k +\frac{40k/9}{1 \vee \left(n p + a_j\right)} \\ +&\leq +(9k)^k +\prod_{j=1}^k +\left( +1 \wedge +\frac{1}{n p + a_j} +\right). +\end{align*} +\end{proof} + +Our next result is probably the most technically involved, +allowing one to bound moments of +(products of) $\frac{\I_b(x)}{N_b(x)}$ by the corresponding moments of +(products of) $\frac{1}{n |T_b(x)|}$, again based on the heuristic +that $N_b(x)$ is conditionally binomial so concentrates around +its conditional expectation +$n \int_{T_b(x)} f(x) \diff s \asymp n |T_b(x)|$. +By independence of the trees, +the latter expected products then factorize +since the dependence on the data $X_i$ has been eliminated. +The proof is complicated, and relies on the following induction procedure. +First we consider the common refinement consisting of the +subcells $\cR$ generated by all possible intersections +of $T_b(x)$ over the selected trees +(say $T_{b}(x), T_{b'}(x), T_{b''}(x)$ +though there could be arbitrarily many). +Note that $N_b(x)$ is the sum of the number of +samples $X_i$ in each such subcell in $\cR$. +We then apply Lemma~\ref{lem:mondrian_app_binomial_bound} repeatedly +to each subcell in $\cR$ in turn, replacing +the number of samples $X_i$ in that subcell with its volume +multiplied by $n$, and controlling the error incurred at each step. +We record the subcells which have been ``checked'' in this manner +using the class $\cD \subseteq \cR$ and proceed by finite induction, +beginning with $\cD = \emptyset$ and ending at $\cD = \cR$. 
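+
+To fix ideas, consider the simplest nontrivial case of $B_0 = 2$ trees. The
+common refinement then consists of at most three nonempty subcells,
+%
+\begin{align*}
+T_1(x) \cap T_2(x),
+\qquad
+T_1(x) \cap T_2(x)^{\comp},
+\qquad
+T_1(x)^{\comp} \cap T_2(x),
+\end{align*}
+%
+and $N_1(x)$ is the number of samples falling in the first subcell plus the
+number falling in the second, with the analogous decomposition for $N_2(x)$.
+The induction then replaces each of these subcell counts in turn by $n$
+times the volume of that subcell, at each step incurring a multiplicative
+constant and an additive error which is exponentially small in $\sqrt\lambda$.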
+ +\begin{lemma}[Generalized moment bound for +Mondrian random forest denominators]% +\label{lem:mondrian_app_moment_denominator} + +Suppose Assumptions~\ref{ass:mondrian_data} +and \ref{ass:mondrian_estimator} hold. +Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ +be independent and $k_b \geq 1$ for $1 \leq b \leq B_0$. +Then with $k = \sum_{b=1}^{B_0} k_b$, +for sufficiently large $n$, +% +\begin{align*} +\E\left[ +\prod_{b=1}^{B_0} +\frac{\I_b(x)}{N_b(x)^{k_b}} +\right] +&\leq +\left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} +\prod_{b=1}^{B_0} +\E \left[ +1 \wedge +\frac{1}{(n |T_b(x)|)^{k_b}} +\right]. +\end{align*} +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_denominator}] + +Define the common refinement of +$\left\{ T_b(x) : 1 \leq b \leq {B_0} \right\}$ as +the class of sets +% +\begin{align*} +\cR +&= \left\{ \bigcap_{b=1}^{B_0} D_b : +D_b \in +\big\{ T_b(x), T_b(x)^{\comp} \big\} +\right\} +\bigsetminus +\left\{ +\emptyset,\, +\bigcap_{b=1}^{B_0} +T_b(x)^\comp +\right\} +\end{align*} +% +and let $\cD \subset \cR$. +We will proceed by induction on the elements of $\cD$, +which represents the subcells we have checked, +starting from $\cD = \emptyset$ and finishing at $\cD = \cR$. +For $D \in \cR$ let +$\cA(D) = \left\{ 1 \leq b \leq {B_0} : D \subseteq T_b(x) \right\}$ +be the indices of the trees which are active on subcell $D$, +and for $1 \leq b \leq {B_0}$ let +$\cA(b) = \left\{ D \in \cR : D \subseteq T_b(x) \right\}$ +be the subcells which are contained in $T_b(x)$, +so that $b \in \cA(D) \iff D \in \cA(b)$. +For a subcell $D \in \cR$, +write $N_b(D) = \sum_{i=1}^{n} \I \left\{ X_i \in D \right\}$ +so that $N_b(x) = \sum_{D \in \cA(b)} N_b(D)$. +Note that for any $D \in \cR \setminus \cD$, +% +\begin{align*} +&\E \left[ +\prod_{b=1}^{B_0} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus \cD} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b} +} +\right] \\ +&\quad= +\E \left[ +\prod_{b \notin \cA(D)} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus \cD} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b} +} \right. \\ +&\left. +\qquad +\times\,\E\left[ +\prod_{b \in \cA(D)} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus \cD} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b} +} \right.\right. \\ +&\left.\left. +\quad\qquad\qquad\biggm| +\bT, +N_b(D') : D' \in \cR +\setminus +(\cD \cup \{D\}) +\right] +\right]. +\end{align*} +% +Now the inner conditional expectation is over $N_b(D)$ only. +Since $f$ is bounded away from zero, +% +\begin{align*} +N_b(D) +&\sim \Bin\left( +n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ +\frac{\int_{D} f(s) \diff s} +{1 - \int_{\bigcup \left( \cR \setminus \cD \right) \setminus D} +f(s) \diff s} +\right) \\ +&\geq \Bin\left( +n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ +|D| \inf_{x \in [0,1]^d} f(x) +\right) +\end{align*} +% +conditional on $\bT$ and +$N_b(D') : D' \in \cR \setminus (\cD \cup \{D\})$. +For sufficiently large $t$ by Lemma~\ref{lem:mondrian_app_active_data} +% +\begin{align*} +\P \left( +\sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D') +> t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) +&\leq +\P \left( N_{\cup}(x) > t^{d+1} +\frac{n}{\lambda^d} +\|f\|_\infty +\right) +\leq +4 d B_0 e^{-t/4}. 
+\end{align*} +% +Thus +$N_b(D) \geq \Bin(n/2, |D| \inf_x f(x))$ +conditional on +$\left\{ \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right\}$ +with probability at least +$1 - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}$. +So by Lemma~\ref{lem:mondrian_app_binomial_bound}, +% +\begin{align*} +&\E \Bigg[ +\prod_{b \in \cA(D)} \! +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus \cD} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b} +} +\biggm| +\! +\bT, +N_b(D')\! : D' \in \cR \setminus \! (\cD \cup \{D\}) +\Bigg] \\ +&\quad\leq +\E \! \left[ +\prod_{b \in \cA(D)} +\frac{(9k)^{k_b}}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} +N_b(D') ++ n |D| \inf_x f(x) / 2 ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b}} +\right] \\ +&\qquad+ +4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \\ +&\quad\leq +\left( \frac{18k}{\inf_x f(x)} \right)^k +\! \E \! \left[ +\prod_{b \in \cA(D)} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} +|D'| +\right)^{k_b}} +\right] \\ +&\qquad+ +4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. +\end{align*} +% +Therefore plugging this back into the marginal expectation yields +% +\begin{align*} +&\E\left[ +\prod_{b=1}^{B_0} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus \cD} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap \cD} +|D'| +\right)^{k_b} +} +\right] \\ +&\quad\leq +\left( \frac{18k}{\inf_x f(x)} \right)^k +\E \left[ +\prod_{b=1}^{B_0} +\frac{1}{ +1 \vee \left( +\sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} +N_b(D') ++ n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} +|D'| +\right)^{k_b}} +\right] \\ +&\qquad+ +4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. +\end{align*} +% +Now we apply induction, +starting with $\cD = \emptyset$ and +adding $D \in \cR \setminus \cD$ to $\cD$ until +$\cD = \cR$. +This takes at most $|\cR| \leq 2^{B_0}$ steps and yields +% +\begin{align*} +\E\left[ +\prod_{b=1}^{B_0} +\frac{\I_b(x)}{N_b(x)^{k_b}} +\right] +&\leq +\E\left[ +\prod_{b=1}^{B_0} +\frac{1}{1 \vee N_b(x)^{k_b}} +\right] += +\E\left[ +\prod_{b=1}^{B_0} +\frac{1}{1 \vee \left( \sum_{D \in \cA(b)} N_b(D) \right)^{k_b}} +\right] +\leq \cdots \\ +&\leq +\left( \frac{18k}{\inf_x f(x)} \right)^{2^{B_0} k} +\left( +\prod_{b=1}^{B_0} +\,\E \left[ +\frac{1}{1 \vee (n |T_b(x)|)^{k_b}} +\right] ++ 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} +\right), +\end{align*} +% +where the expectation factorizes due to independence of $T_b(x)$. +The last step is to remove the trailing exponential term. +To do this, note that by Jensen's inequality, +% +\begin{align*} +\prod_{b=1}^{B_0} +\,\E \left[ +\frac{1}{1 \vee (n |T_b(x)|)^{k_b}} +\right] +&\geq +\prod_{b=1}^{B_0} +\frac{1} +{\E \left[ 1 \vee (n |T_b(x)|)^{k_b} \right]} +\geq +\prod_{b=1}^{B_0} +\frac{1}{n^{k_b}} += n^{-k} +\geq +4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} +\end{align*} +% +for sufficiently large $n$ +because $B_0$, $d$, and $k$ are fixed while +$\log \lambda \gtrsim \log n$. +\end{proof} + +Now that moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ +have been bounded by moments of +(products of) $\frac{1}{n |T_b(x)|}$, we establish further +explicit bounds for these in the next result. +Note that the problem has been reduced to determining +properties of Mondrian cells, so once again we return to the +exact cell shape distribution given by \citet{mourtada2020minimax}, +and evaluate the appropriate expectations by integration. 
+Note that the truncation by taking the minimum with one inside the expectation
+is essential here, as otherwise the second moment of the inverse Mondrian cell
+volume is not even finite. As such, there is a ``penalty'' of $\log n$
+when bounding truncated second moments,
+and the upper bound for the $k$th moment is significantly
+larger than the naive assumption of $(\lambda^d / n)^k$
+whenever $k \geq 3$.
+This ``small cell'' phenomenon in which the inverse volumes of Mondrian cells
+have heavy tails is a recurring challenge.
+
+\begin{lemma}[Inverse moments of the volume of a Mondrian cell]%
+\label{lem:mondrian_app_moment_cell}
+
+Suppose Assumption~\ref{ass:mondrian_estimator} holds
+and let $T \sim \cM\big([0,1]^d, \lambda\big)$.
+Then for sufficiently large $n$,
+%
+\begin{align*}
+\E\left[
+1 \wedge
+\frac{1}{(n |T(x)|)^k}
+\right]
+&\leq
+\left(
+\frac{\lambda^d}{n}
+\right)^{\I \left\{ k = 1 \right\}}
+\left(
+\frac{3 \lambda^{2d} \log n}{n^2}
+\right)^{\I \left\{ k \geq 2 \right\}}
+\prod_{j=1}^{d} \frac{1}{x_j (1-x_j)}.
+\end{align*}
+%
+\end{lemma}
+
+\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_cell}]
+
+By \citet[Proposition~1]{mourtada2020minimax},
+$|T(x)| = \prod_{j=1}^{d}
+\left(
+\left(\frac{1}{\lambda} E_{j1} \right) \wedge x_j
++ \left( \frac{1}{\lambda} E_{j2} \right) \wedge (1-x_j)
+\right)$
+where $E_{j1}$ and $E_{j2}$
+are mutually independent $\Exp(1)$ random variables.
+Thus the truncated inverse moments can be bounded, for any truncation
+level $t > 0$, by a direct calculation with the joint law of
+$(E_{j1}, E_{j2})_{1 \leq j \leq d}$.
+If $k > 2$ we use
+$\frac{1}{1 \vee (n |T(x)|)^k} \leq \frac{1}{1 \vee (n |T(x)|)^{k-1}}$
+to reduce $k$. Now if $k = 1$ we let $t \to 0$, giving
+%
+\begin{align*}
+\E \left[
+\frac{1}{1 \vee (n |T(x)|)}
+\right]
+&\leq
+\frac{\lambda^d}{n}
+\prod_{j=1}^d
+\frac{1}{x_j(1-x_j)},
+\end{align*}
+%
+and if $k = 2$ then we set $t = 1/n^2$ so that for
+sufficiently large $n$,
+%
+\begin{align*}
+\E \left[
+\frac{1}{1 \vee (n |T(x)|)^2}
+\right]
+&\leq
+\frac{d}{n^2}
++ \frac{2 \lambda^{2d} \log n}{n^2}
+\prod_{j=1}^d
+\frac{1}{x_j(1-x_j)}
+\leq
+\frac{3 \lambda^{2d} \log n}{n^2}
+\prod_{j=1}^d
+\frac{1}{x_j(1-x_j)}.
+\end{align*}
+%
+Lower bounds which match up to constants for the first moment and up to
+logarithmic terms for the second moment are obtained as
+$\E \left[ 1 \wedge \frac{1}{(n|T(x)|)^2} \right]
+\geq \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]^2$
+by Jensen's inequality, and
+%
+\begin{align*}
+\E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]
+&\geq \frac{1}{1 + n \E \left[ |T(x)| \right]}
+\geq \frac{1}{1 + 2^d n / \lambda^d}
+\gtrsim \frac{\lambda^d}{n}.
+\end{align*}
+\end{proof}
+
+The endeavor to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ is
+concluded with the next result, combining the previous two lemmas to give a
+bound without expectations on the right.
+
+\begin{lemma}[Simplified generalized moment bound for
+Mondrian forest denominators]%
+\label{lem:mondrian_app_simple_moment_denominator}
+%
+Suppose Assumptions~\ref{ass:mondrian_data}
+and \ref{ass:mondrian_estimator} hold.
+Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$
+and $k_b \geq 1$ for $1 \leq b \leq B_0$.
+Then with $k = \sum_{b=1}^{B_0} k_b$,
+%
+\begin{align*}
+&\E\left[
+\prod_{b=1}^{B_0}
+\frac{\I_b(x)}{N_b(x)^{k_b}}
+\right] \\
+&\quad\leq
+\left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k}
+\left(
+\prod_{j=1}^{d} \frac{1}{x_j (1-x_j)}
+\right)^{B_0}
+\prod_{b=1}^{B_0}
+\left(
+\frac{\lambda^d}{n}
+\right)^{\I \left\{ k_b = 1 \right\}}
+\left(
+\frac{\lambda^{2d} \log n}{n^2}
+\right)^{\I \left\{ k_b \geq 2 \right\}}
+\end{align*}
+%
+for sufficiently large $n$.
+% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_simple_moment_denominator}] +This follows directly from +Lemmas~\ref{lem:mondrian_app_moment_denominator} and +\ref{lem:mondrian_app_moment_cell}. +\end{proof} + +Our final preliminary lemma is concerned with further properties of +the inverse truncated binomial distribution, again with the aim +of analyzing $\frac{\I_b(x)}{N_b(x)}$. +This time, instead of merely upper bounding the moments, +we aim to give convergence results for those moments, +again in terms of moments of $\frac{1}{n |T_b(x)|}$. +This time we only need to handle the first +and second moment, so this result does not strictly generalize +Lemma~\ref{lem:mondrian_app_binomial_bound} except in simple cases. +The proof is by Taylor's theorem and the Cauchy--Schwarz inequality, +using explicit expressions for moments of the binomial distribution +and bounds from Lemma~\ref{lem:mondrian_app_binomial_bound}. + +\begin{lemma}[Expectation inequalities for the binomial distribution]% +\label{lem:mondrian_app_binomial_expectation} +Let $N \sim \Bin(n, p)$ and take $a, b \geq 1$. Then +% +\begin{align*} +0 +&\leq +\E \left[ +\frac{1}{N+a} +\right] +- \frac{1}{n p+a} +\leq +\frac{2^{19}}{(n p+a)^2}, \\ +0 +&\leq +\E \left[ +\frac{1}{(N+a)(N+b)} +\right] +- \frac{1}{(n p+a)(n p+b)} +\leq +\frac{2^{27}}{(n p +a)(n p +b)} +\left( +\frac{1}{n p + a} ++ \frac{1}{n p + b} +\right). +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_expectation}] + +For the first result, +Taylor's theorem with Lagrange remainder +for $N \mapsto \frac{1}{N+a}$ around $n p$ gives +% +\begin{align*} +\E \left[ +\frac{1}{N+a} +\right] +&= +\E \left[ +\frac{1}{n p+a} +- \frac{N - n p}{(n p+a)^2} ++ \frac{(N - n p)^2}{(\xi+a)^3} +\right] +\end{align*} +% +for some $\xi$ between $n p$ and $N$. The second term in the expectation +is zero-mean, showing the non-negativity part, and the +Cauchy--Schwarz inequality for the remaining term gives +% +\begin{align*} +\E \left[ +\frac{1}{N+a} +\right] +- \frac{1}{n p+a} +&\leq +\E \left[ +\frac{(N - n p)^2}{(n p+a)^3} ++ \frac{(N - n p)^2}{(N+a)^3} +\right] \\ +&\leq +\frac{\E\big[(N - n p)^2\big]}{(n p+a)^3} ++ \sqrt{ +\E\big[(N - n p)^4\big] +\E \left[ +\frac{1}{(N+a)^6} +\right]}. +\end{align*} +% +Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ +and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that +% +\begin{align*} +\E \left[ +\frac{1}{N+a} +\right] +- \frac{1}{n p+a} +&\leq +\frac{n p}{(n p+a)^3} ++ \sqrt{\frac{54^6 n p(1+3 n p)}{(n p + a)^6}} +\leq +\frac{2^{19}}{(n p+a)^2}. +\end{align*} +% +For the second result, +Taylor's theorem applied to $N \mapsto \frac{1}{(N+a)(N+b)}$ +around $n p$ gives +% +\begin{align*} +\E \left[ +\frac{1}{(N+a)(N+b)} +\right] +&= +\E \left[ +\frac{1}{(n p+a)(n p + b)} +- \frac{(N - n p)(2 n p + a + b)}{(n p + a)^2 (n p + b)^2} +\right] \\ +&\quad+ +\E \left[ +\frac{(N - n p)^2}{(\xi+a)(\xi+b)} +\left( +\frac{1}{(\xi + a)^2} ++ \frac{1}{(\xi + a)(\xi + b)} ++ \frac{1}{(\xi + b)^2} +\right) +\right] +\end{align*} +% +for some $\xi$ between $n p$ and $N$. 
The second term on the right is +zero-mean, showing non-negativity, and applying the Cauchy--Schwarz +inequality to the remaining term gives +% +\begin{align*} +&\E \left[ +\frac{1}{(N+a)(N+b)} +\right] +- \frac{1}{n p+a} \\ +&\quad\leq +\E \left[ +\frac{2 (N - n p)^2}{(N+a)(N+b)} +\left( +\frac{1}{(N + a)^2} ++ \frac{1}{(N + b)^2} +\right) +\right] \\ +&\qquad+ +\E \left[ +\frac{2 (N - n p)^2}{(n p +a)(n p +b)} +\left( +\frac{1}{(n p + a)^2} ++ \frac{1}{(n p + b)^2} +\right) +\right] \\ +&\quad\leq +\sqrt{ +4 \E \left[ (N - n p)^4 \right] +\E \left[ +\frac{1}{(N + a)^6 (N+b)^2} ++ \frac{1}{(N + b)^6 (N+a)^2} +\right]} \\ +&\qquad+ +\frac{2 \E\big[(N - n p)^2\big]}{(n p +a)(n p +b)} +\left( +\frac{1}{(n p + a)^2} ++ \frac{1}{(n p + b)^2} +\right). +\end{align*} +% +Now we use +$\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ +and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that +% +\begin{align*} +\E \left[ +\frac{1}{(N+a)(N+b)} +\right] +- \frac{1}{n p+a} +&\leq +\sqrt{ +\frac{4n p (1 + 3n p) \cdot 72^8}{(n p + a)^2 (n p + b)^2} +\left( +\frac{1}{(n p + a)^4} ++ \frac{1}{(n p + b)^4} +\right)} \\ +&\quad+ +\frac{2 n p}{(n p +a)(n p +b)} +\left( +\frac{1}{(n p + a)^2} ++ \frac{1}{(n p + b)^2} +\right) \\ +&\leq +\frac{2^{27}}{(n p + a) (n p + b)} +\left( +\frac{1}{n p + a} ++ \frac{1}{n p + b} +\right). +\end{align*} +% +\end{proof} + +\section{Proofs of main results} +\label{sec:mondrian_app_proofs} + +\subsection{Mondrian random forests} + +We give rigorous proofs of the central limit theorem, +bias characterization, and variance estimation +results for the Mondrian random forest estimator without debiasing. +See Section~\ref{sec:mondrian_overview_proofs} in the main text +for details on our approaches to these proofs. + +\begin{proof}[Theorem~\ref{thm:mondrian_clt}] +From the debiased version +(Theorem~\ref{thm:mondrian_clt_debiased}) with $J=0$, $a_0 = 1$, and +$\omega_0 = 1$. +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_bias}] + +\proofparagraph{removing the dependence on the trees} + +By measurability and with $\mu(X_i) = \E[Y_i \mid X_i]$ almost surely, +% +\begin{align*} +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \mu(x) +&= +\frac{1}{B} +\sum_{b=1}^B +\sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) +\frac{\I_{i b}(x)}{N_b(x)}. +\end{align*} +% +Conditional on $\bX$, +the terms in the outer sum depend only on $T_b$ so are i.i.d. +As $\mu$ is Lipschitz, +% +\begin{align*} +&\Var \big[ +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \mu(x) +\mid \bX +\big] +\leq +\frac{1}{B} +\E \left[ +\left( +\sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) +\frac{\I_{i b}(x)}{N_b(x)} +\right)^2 +\Bigm| \bX +\right] \\ +&\quad\lesssim +\frac{1}{B} +\E \left[ +\max_{1 \leq i \leq n} +\big\| X_i - x \big\|_2^2 +\left( +\sum_{i=1}^n +\frac{\I_{i b}(x)}{N_b(x)} +\right)^2 +\Bigm| \bX +\right] +\lesssim +\frac{1}{B} +\sum_{j=1}^{d} +\E \left[ +|T(x)_j|^2 +\right] +\lesssim +\frac{1}{\lambda^2 B}, +\end{align*} +% +using the law of $T(x)_j$ from \citet[Proposition~1]{mourtada2020minimax}. +By Chebyshev's inequality, +% +\begin{align*} +\big| +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \E \left[ \hat \mu(x) \mid \bX \right] +\big| +&\lesssim_\P +\frac{1}{\lambda \sqrt B}. +\end{align*} + +\proofparagraph{showing the conditional bias converges in probability} + +Now $\E \left[ \hat\mu(x) \mid \bX \right]$ +is a non-linear function of the i.i.d.\ random variables $X_i$, +so we use the Efron--Stein inequality +\citep{efron1981jackknife} to bound its variance. 
+Let $\tilde X_{i j} = X_i$ if $i \neq j$ and be an +independent copy of $X_j$, denoted $\tilde X_j$, if $i = j$. +Write $\tilde \bX_j = (\tilde X_{1j}, \ldots, \tilde X_{n j})$ +and similarly +$\tilde \I_{i j b}(x) = \I \big\{ \tilde X_{i j} \in T_b(x) \big\}$ +and $N_{j b}(x) = \sum_{i=1}^{n} \tilde \I_{i j b}(x)$. +% +\begin{align} +\nonumber +&\Var \left[ +\sum_{i=1}^{n} +\big( \mu(X_i) - \mu(x) \big) +\E \left[ +\frac{\I_{i b}(x)}{N_b(x)} +\Bigm| \bX +\right] +\right] \\ +\nonumber +&\quad\leq +\frac{1}{2} +\sum_{j=1}^{n} +\E \! \left[ +\! \left( +\sum_{i=1}^{n} +\big( \mu(X_i) - \mu(x) \big) +\E \! \left[ +\frac{\I_{i b}(x)}{N_b(x)} +\Bigm| \bX +\right] +- \sum_{i=1}^{n} +\left( \mu(\tilde X_{i j}) - \mu(x) \right) +\E \! \left[ +\frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} +\Bigm| \tilde \bX_j +\right] +\right)^{\! \! 2} +\right] \\ +\nonumber +&\quad\leq +\frac{1}{2} +\sum_{j=1}^{n} +\E \left[ +\left( +\sum_{i=1}^{n} +\left( +\big( \mu(X_i) - \mu(x) \big) +\frac{\I_{i b}(x)}{N_b(x)} +- \left( \mu(\tilde X_{i j}) - \mu(x) \right) +\frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} +\right) +\right)^2 +\right] \\ +\nonumber +&\quad\leq +\sum_{j=1}^{n} +\E \left[ +\left( +\sum_{i \neq j} +\big( \mu(X_i) - \mu(x) \big) +\left( +\frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} +\right) +\right)^{\!\!2} \, +\right] \\ +\label{eq:mondrian_app_bias_efron_stein} +&\qquad+ +2 \sum_{j=1}^{n} +\E \left[ +\left( \mu(X_j) - \mu(x) \right)^2 +\frac{\I_{j b}(x)}{N_b(x)^2} +\right]. +\end{align} +% +For the first term in \eqref{eq:mondrian_app_bias_efron_stein} to be non-zero, +we must have $|N_b(x) - \tilde N_{j b}(x)| = 1$. +Writing $N_{-j b}(x) = \sum_{i \neq j} \I_{i b}(x)$, +assume by symmetry that +$\tilde N_{j b}(x) = N_{-j b}(x)$ and $N_b(x) = N_{-j b}(x) + 1$, +and $\I_{j b}(x) = 1$. +As $f$ is bounded and $\mu$ is Lipschitz, +writing $\I_{-j b}(x) = \I \left\{ N_{-j b}(x) \geq 1 \right\}$, +% +\begin{align*} +&\sum_{j=1}^{n} +\E \left[ +\left( +\sum_{i \neq j} +\left( \mu(X_i) - \mu(x) \right) +\left( +\frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} +\right) +\right)^{\! 2} \, +\right] \\ +&\quad\lesssim +\sum_{j=1}^{n} +\E \left[ +\max_{1 \leq l \leq d} +|T_b(x)_l|^2 +\left( +\frac{\sum_{i \neq j}\I_{i b}(x) \I_{j b}(x)} +{N_{-j b}(x)(N_{-j b}(x) + 1)} +\right)^2 +\right] +\lesssim +\E \left[ +\max_{1 \leq l \leq d} +|T_b(x)_l|^2 +\frac{\I_{b}(x)}{N_{b}(x)} +\right]. +\end{align*} +% +For $t > 0$, partition by +$\left\{ \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right\}$ +and apply Lemma~\ref{lem:mondrian_app_largest_cell} and +Lemma~\ref{lem:mondrian_app_simple_moment_denominator}: +% +\begin{align*} +\E \left[ +\max_{1 \leq l \leq d} +|T_b(x)_l|^2 +\frac{\I_{b}(x)}{N_{b}(x)} +\right] +&\leq +\P \left( +\max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda +\right) ++ (t / \lambda)^2\, +\E \left[ +\frac{\I_{b}(x)}{N_{b}(x)} +\right] \\ +&\lesssim +e^{-t/2} ++ \left( \frac{t}{\lambda} \right)^2 +\frac{\lambda^d}{n} +\lesssim +\frac{1}{n^2} ++ \frac{(\log n)^2}{\lambda^2} +\frac{\lambda^d}{n} +\lesssim +\frac{(\log n)^2}{\lambda^2} +\frac{\lambda^{d}}{n}, +\end{align*} +% +where we set $t = 4 \log n$. 
+For the second term in \eqref{eq:mondrian_app_bias_efron_stein} we have +% +\begin{align*} +\sum_{j=1}^{n} +\E \left[ +\left( \mu(X_j) - \mu(x) \right)^2 +\frac{\I_{j b}(x)}{N_b(x)^2} +\right] +&\lesssim +\E \left[ +\max_{1 \leq l \leq d} +|T_b(x)_l|^{2} +\frac{\I_{b}(x)}{N_b(x)} +\right] +\lesssim +\frac{(\log n)^2}{\lambda^2} +\frac{\lambda^{d}}{n} +\end{align*} +% +in the same manner. +Hence +% +\begin{align*} +\Var \left[ +\sum_{i=1}^{n} +\left( \mu(X_i) - \mu(x) \right) +\E \left[ +\frac{\I_{i b}(x)}{N_b(x)} +\Bigm| \bX +\right] +\right] +&\lesssim +\frac{(\log n)^2}{\lambda^2} +\frac{\lambda^{d}}{n}, +\end{align*} +% +and so by Chebyshev's inequality, +% +\begin{align*} +\big| +\E \left[ \hat \mu(x) \mid \bX, \bT \right] +- \E \left[ \hat \mu(x) \right] +\big| +&\lesssim_\P +\frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} +\sqrt{ \frac{\lambda^{d}}{n} }. +\end{align*} + +\proofparagraph{computing the limiting bias} + +It remains to compute the limit of +$\E \left[ \hat \mu(x) \right] - \mu(x)$. +Let $\bX_{-i} = (X_1, \ldots, X_{i-1}, X_{i+1}, \ldots, X_n)$ +and $N_{-i b}(x) = \sum_{j=1}^n \I\{j \neq i\} \I\{X_j \in T_b(x)\}$. +Then +% +\begin{align*} +&\E \left[ \hat \mu(x) \right] +- \mu(x) += +\E \left[ +\sum_{i=1}^{n} +\left( \mu(X_i) - \mu(x) \right) +\frac{\I_{i b}(x)}{N_b(x)} +\right] \\ +&\quad= +\sum_{i=1}^{n} +\E \left[ +\E \left[ +\frac{\left( \mu(X_i) - \mu(x) \right)\I_{i b}(x)} +{N_{-i b}(x) + 1} +\bigm| \bT, \bX_{-i} +\right] +\right] += n \, +\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{N_{-i b}(x) + 1} +\right]. +\end{align*} +% +By Lemma~\ref{lem:mondrian_app_binomial_expectation}, as +$N_{-i b}(x) \sim \Bin\left(n-1, +\int_{T_b(x)} f(s) \diff s \right)$ +given $\bT$ and $f$ is bounded below, +% +\begin{align*} +\left| +\E \! \left[ +\frac{1}{N_{-i b}(x) + 1} +\Bigm| \bT +\right] +- \frac{1}{(n-1) \! \int_{T_b(x)} \! f(s) \diff s + 1} +\right| +&\lesssim +\frac{1}{n^2 \! \left( \int_{T_b(x)} f(s) \diff s \right)^2} +\wedge 1 +\lesssim +\frac{1}{n^2 |T_b(x)|^2} +\wedge 1, +\end{align*} +% +and also +% +\begin{align*} +\left| +\frac{1}{(n-1) \int_{T_b(x)} f(s) \diff s + 1} +- \frac{1}{n \int_{T_b(x)} f(s) \diff s} +\right| +&\lesssim +\frac{1}{n^2 \left( \int_{T_b(x)} f(s) \diff s\right)^2} +\wedge 1 +\lesssim +\frac{1}{n^2 |T_b(x)|^2} +\wedge 1. +\end{align*} +% +So by Lemmas~\ref{lem:mondrian_app_largest_cell} +and \ref{lem:mondrian_app_moment_cell}, +since $f$ is Lipschitz and bounded, using Cauchy--Schwarz, +% +\begin{align*} +&\left| +\E \left[ \hat \mu(x) \right] +- \mu(x) +- \E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{\int_{T_b(x)} f(s) \diff s} +\right] +\right| +\lesssim +\E \left[ +\frac{n \int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} +{n^2 |T_b(x)|^2 \vee 1} +\right] \\ +&\qquad\lesssim +\E \left[ +\frac{\max_{1 \leq l \leq d} |T_b(x)_l| } +{n |T_b(x)| \vee 1} +\right] \\ +&\qquad\lesssim +\frac{2 \log n}{\lambda} \, +\E \left[ +\frac{1}{n |T_b(x)| \vee 1} +\right] ++ \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| > +\frac{2 \log n}{\lambda} \right)^{1/2} +\E \left[ +\frac{1} +{n^2 |T_b(x)|^2 \vee 1} +\right]^{1/2} \\ +&\qquad\lesssim +\frac{\log n}{\lambda} \, +\frac{\lambda^d}{n} ++ \frac{d}{n} +\frac{\lambda^d \sqrt{\log n}}{n} +\lesssim +\frac{\log n}{\lambda} \, +\frac{\lambda^d}{n}. +\end{align*} +% +Next set +$A = \frac{1}{f(x) |T_b(x)|} \int_{T_b(x)} (f(s) - f(x)) \diff s +\geq \inf_{s \in [0,1]^d} \frac{f(s)}{f(x)} - 1$. 
+Use the Maclaurin series of $\frac{1}{1+x}$ +up to order $\flbeta$ to see +$\frac{1}{1 + A} = \sum_{k=0}^{\flbeta} (-1)^k A^k ++ O \left( A^{\flbeta + 1} \right)$. +Hence +% +\begin{align*} +&\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{\int_{T_b(x)} f(s) \diff s} +\right] += +\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{f(x) |{T_b(x)}|} +\frac{1}{1 + A} +\right] \\ +&\quad= +\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{f(x) |{T_b(x)}|} +\left( +\sum_{k=0}^{\flbeta} +(-1)^k +A^k ++ O \left( |A|^{\flbeta + 1} \right) +\right) +\right]. +\end{align*} +% +Note that since $f$ and $\mu$ are Lipschitz, +and by integrating the tail probability given in +Lemma~\ref{lem:mondrian_app_largest_cell}, the Maclaurin remainder term is +bounded by +% +\begin{align*} +&\E \left[ +\frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} +{f(x) |{T_b(x)}|} +|A|^{\flbeta + 1} +\right] \\ +&\qquad= +\E \left[ +\frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} +{f(x) |{T_b(x)}|} +\left( +\frac{1}{f(x) |{T_b(x)}|} \int_{T_b(x)} (f(s) - f(x)) \diff s +\right)^{\flbeta + 1} +\right] \\ +&\qquad\lesssim +\E \left[ +\max_{1 \leq l \leq d} +|T_b(x)_l|^{\flbeta+2} +\right] += +\int_{0}^{\infty} +\P \left( +\max_{1 \leq l \leq d} +|T_b(x)_l| +\geq t^{\frac{1}{\flbeta+2}} +\right) +\diff t +\leq +\int_{0}^{\infty} +2 d e^{- \lambda t^{\frac{1}{\flbeta+2}} / 2} +\diff t \\ +&\qquad= +\frac{2^{\flbeta + 3} d (\flbeta + 2)! } +{\lambda^{\flbeta + 2}} +\lesssim +\frac{1}{\lambda^{\beta}}, +\end{align*} +% +since $\int_0^\infty e^{-a x^{1/k}} \diff x += a^{-k} k!$. +To summarize the progress so far, we have +% +\begin{align*} +&\left| +\E \left[ +\hat \mu(x) +\right] +- \mu(x) +- \sum_{k=0}^{\flbeta} +(-1)^k \, +\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{f(x)^{k+1} |T_b(x)|^{k+1}} +\left( +\int_{T_b(x)} (f(s) - f(x)) \diff s +\right)^k +\right] +\right| \\ +&\qquad\lesssim +\frac{\log n}{\lambda} +\frac{\lambda^d}{n} ++ \frac{1}{\lambda^\beta}. +\end{align*} +% +We evaluate the expectation. +By Taylor's theorem, with $\nu$ a multi-index, +as $f \in \cH^\beta$, +% +\begin{align*} +\left( +\int_{T_b(x)} (f(s) - f(x)) \diff s +\right)^k +&= +\left( +\sum_{|\nu| = 1}^\flbeta +\frac{\partial^\nu f(x)}{\nu !} +\! \int_{T_b(x)} +\!\! (s - x)^\nu +\diff s +\right)^k ++ O \! \left( +|T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta +\right). +\end{align*} +% +Next, by the multinomial theorem +with a multi-index $u$ indexed by $\nu$ with $|\nu| \geq 1$, +% +\begin{align*} +\left( +\sum_{|\nu| = 1}^\flbeta +\frac{\partial^\nu f(x)}{\nu !} +\int_{T_b(x)} +(s - x)^\nu +\diff s +\right)^k +&= +\sum_{|u| = k} +\binom{k}{u} +\left( +\frac{\partial^\nu f(x)}{\nu !} +\int_{T_b(x)} (s-x)^\nu \diff s +\right)^u +\end{align*} +% +where $\binom{k}{u}$ is a multinomial coefficient. +By Taylor's theorem with $f, \mu \in \cH^\beta$, +% +\begin{align*} +&\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s \\ +&\quad= +\sum_{|\nu'|=1}^{\flbeta} +\sum_{|\nu''|=0}^{\flbeta} +\frac{\partial^{\nu'} \mu(x)}{\nu' !} +\frac{\partial^{\nu''} f(x)}{\nu'' !} +\int_{T_b(x)} (s-x)^{\nu' + \nu''} \diff s ++ O \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right). +\end{align*} +% +Now by integrating the tail probabilities in +Lemma~\ref{lem:mondrian_app_largest_cell}, +$ \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right] +\lesssim \frac{1}{\lambda^\beta}$. 
+Therefore, by Lemma~\ref{lem:mondrian_app_moment_cell}, +writing $T_b(x)^\nu$ for $\int_{T_b(x)} (s-x)^\nu \diff s$, +% +\begin{align*} +&\sum_{k=0}^{\flbeta} +(-1)^k \, +\E \left[ +\frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} +{f(x)^{k+1} |T_b(x)|^{k+1}} +\left( +\int_{T_b(x)} (f(s) - f(x)) \diff s +\right)^k +\right] \\ +&\,= +\! \sum_{k=0}^{\flbeta} +(-1)^k \, +\E \! +\left[ +\! \frac{ +\sum_{|\nu'|=1}^{\flbeta} +\! \sum_{|\nu''|=0}^{\flbeta} +\! \frac{\partial^{\nu'} \mu(x)}{\nu' !} +\frac{\partial^{\nu''} f(x)}{\nu'' !} +T_b(x)^{\nu' + \nu''\!\!\!} +}{f(x)^{k+1} |T_b(x)|^{k+1}} +\!\! \sum_{|u| = k} +\! \binom{k}{u} +\!\! +\left( +\frac{\partial^\nu f(x)}{\nu !} +T_b(x)^\nu +\right)^{\!\! u} +\right] +\! + O \! \left( +\frac{1}{\lambda^\beta} +\right) \\ +&\,= +\sum_{|\nu'|=1}^{\flbeta} +\sum_{|\nu''|=0}^{\flbeta} +\sum_{|u|=0}^{\flbeta} +\frac{\partial^{\nu'} \mu(x)}{\nu' !} +\frac{\partial^{\nu''} f(x)}{\nu'' !} +\left( \frac{\partial^\nu f(x)}{\nu !} \right)^u +\binom{|u|}{u} +\frac{(-1)^{|u|}}{f(x)^{|u|+1}} +\E \left[ +\frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} +\right] \\ +&\quad+ +O \left( +\frac{1}{\lambda^\beta} +\right) . +\end{align*} +% +We show this is a polynomial in $1/\lambda$. +For $1 \leq j \leq d$, define +$E_{1j*} \sim \Exp(1) \wedge (\lambda x_j)$ +and $E_{2j*} \sim \Exp(1) \wedge (\lambda (1-x_j))$ +independent so +$T_b(x) = \prod_{j=1}^{d} [x_j - E_{1j*} / \lambda, x_j + E_{2j*} / \lambda]$. +Then +% +\begin{align*} +T_b(x)^\nu +&= +\int_{T_b(x)} (s-x)^\nu \diff s += \prod_{j=1}^d +\int_{x_j - E_{1j*}/\lambda}^{x_j+E_{2j*}/\lambda} +(s - x_j)^{\nu_j} \diff s += \prod_{j=1}^d +\int_{-E_{1j*}}^{E_{2j*}} (s / \lambda)^{\nu_j} 1/\lambda \diff s \\ +&= +\lambda^{-d - |\nu|} +\prod_{j=1}^d +\int_{-E_{1j*}}^{E_{2j*}} s^{\nu_j} \diff s += \lambda^{-d - |\nu|} +\prod_{j=1}^d +\frac{E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}} +{\nu_j + 1}. +\end{align*} +% +So by independence over $j$, +% +\begin{align} +\label{eq:mondrian_app_bias_calc} +&\E \left[ +\frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} +\right] \\ +\nonumber +&\quad= +\lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} +\prod_{j=1}^d +\E \left[ +\frac{E_{2j*}^{\nu'_j + \nu''_j + 1} ++ (-1)^{\nu'_j + \nu''_j} E_{1j*}^{\nu'_j + \nu''_j + 1}} +{(\nu'_j + \nu''_j + 1) (E_{2j*} + E_{1j*})} +\frac{\left(E_{2j*}^{\nu_j + 1} ++ (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}\right)^u} +{(\nu_j + 1)^u (E_{2j*} + E_{1j*})^{|u|}} +\right]. +\end{align} +% +The final step is to replace $E_{1j*}$ +by $E_{1j} \sim \Exp(1)$ and similarly for $E_{2j*}$. +For some $C > 0$, +% +\begin{align*} +\P \! \left( +\bigcup_{j=1}^{d} +\left( +\left\{ +E_{1j*} \neq E_{1j} +\right\} +\cup +\left\{ +E_{2j*} \neq E_{2j} +\right\} +\right) +\! \right) +&\leq +2d\, +\P \! \left( +\Exp(1) \geq \lambda \min_{1 \leq j \leq d} +(x_j \wedge (1-x_j)) +\! \right) +\leq +2d e^{-C \lambda}. +\end{align*} +% +Further, the quantity inside the expectation in +\eqref{eq:mondrian_app_bias_calc} +is bounded almost surely by one and so +the error incurred by replacing +$E_{1j*}$ and $E_{2j*}$ by $E_{1j}$ and $E_{2j}$ +in \eqref{eq:mondrian_app_bias_calc} +is at most $2 d e^{-C \lambda} \lesssim \lambda^{-\beta}$. 
+Thus the limiting bias is +% +\begin{align} +\nonumber +&\E \left[ \hat \mu(x) \right] +- \mu(x) \\ +\nonumber +&\quad= +\sum_{|\nu'|=1}^{\flbeta} +\sum_{|\nu''|=0}^{\flbeta} +\sum_{|u|=0}^{\flbeta} +\frac{\partial^{\nu'} \mu(x)}{\nu' !} +\frac{\partial^{\nu''} f(x)}{\nu'' !} +\left( \frac{\partial^\nu f(x)}{\nu !} \right)^u +\binom{|u|}{u} +\frac{(-1)^{|u|}}{f(x)^{|u|+1}} +\, \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \\ +\nonumber +&\qquad\quad\times +\prod_{j=1}^d +\E \left[ +\frac{E_{2j}^{\nu'_j + \nu''_j + 1} ++ (-1)^{\nu'_j + \nu''_j} E_{1j}^{\nu'_j + \nu''_j + 1}} +{(\nu'_j + \nu''_j + 1) (E_{2j} + E_{1j})} +\frac{\left(E_{2j}^{\nu_j + 1} ++ (-1)^{\nu_j} E_{1j}^{\nu_j + 1}\right)^u} +{(\nu_j + 1)^u (E_{2j} + E_{1j})^{|u|}} +\right] \\ +\label{eq:mondrian_app_bias} +&\qquad+ +O \left( \frac{\log n}{\lambda} \frac{\lambda^d}{n} \right) ++ O \left( \frac{1}{\lambda^\beta} \right), +\end{align} +% +recalling that $u$ is a multi-index which is indexed by the multi-index $\nu$. +This is a polynomial in $\lambda$ of degree at most $\flbeta$, +since higher-order terms can be absorbed into $O(1 / \lambda^\beta)$, +which has finite coefficients depending only on +the derivatives up to order $\flbeta$ of $f$ and $\mu$ at $x$. +Now we show that the odd-degree terms in this polynomial are all zero. +Note that a term is of odd degree if and only if +$|\nu'| + |\nu''| + |\nu| \cdot u$ is odd. +This implies that there exists $1 \leq j \leq d$ such that +exactly one of either +$\nu'_j + \nu''_j$ is odd or +$\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd. + +If $\nu'_j + \nu''_j$ is odd, then +$\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is even, so +$|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is even. +Consider the effect of swapping $E_{1j}$ and $E_{2j}$, +an operation which preserves their joint law, in each of +% +\begin{align} +\label{eq:mondrian_app_bias_odd_1} +\frac{E_{2j}^{\nu'_j + \nu''_j + 1} +- (-E_{1j})^{\nu'_j + \nu''_j + 1}} +{E_{2j} + E_{1j}} +\end{align} +% +and +% +\begin{align} +\label{eq:mondrian_app_bias_odd_2} +&\frac{\left(E_{2j}^{\nu_j + 1} +- (-E_{1j})^{\nu_j + 1}\right)^u} +{(E_{2j} + E_{1j})^{|u|}} += \!\!\! +\prod_{\substack{|\nu| = 1 \\ +\nu_j u_\nu \text{ even}}}^\beta +\!\!\! +\frac{\left(E_{2j}^{\nu_j + 1} +- (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} +{(E_{2j} + E_{1j})^{u_\nu}} +\!\!\! +\prod_{\substack{|\nu| = 1 \\ +\nu_j u_\nu \text{ odd}}}^\beta +\!\!\! +\frac{\left(E_{2j}^{\nu_j + 1} +- (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} +{(E_{2j} + E_{1j})^{u_\nu}}. +\end{align} +% +Clearly, $\nu'_j + \nu''_j$ being odd inverts the +sign of \eqref{eq:mondrian_app_bias_odd_1}. +For \eqref{eq:mondrian_app_bias_odd_2}, +each term in the first product has either +$\nu_j$ even or $u_\nu$ even, so its sign is preserved. +Every term in the second product of \eqref{eq:mondrian_app_bias_odd_2} +has its sign inverted due to both $\nu_j$ and $u_\nu$ being odd, +but there are an even number of terms, +preserving the overall sign. +Therefore the expected product +of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} +is zero by symmetry. + +If however $\nu'_j + \nu''_j$ is even, then +$\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd so +$|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is odd. +Clearly, the sign of \eqref{eq:mondrian_app_bias_odd_1} is preserved. +Again the sign of the first product in \eqref{eq:mondrian_app_bias_odd_2} +is preserved, and the sign of every term in \eqref{eq:mondrian_app_bias_odd_2} +is inverted. 
However there are now an odd number of terms in the +second product, so its overall sign is inverted. +Therefore the expected product +of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} +is again zero. + +\proofparagraph{calculating the second-order bias} + +Next we calculate some special cases, beginning with +the form of the leading second-order bias, +where the exponent in $\lambda$ is +$|\nu'| + |\nu''| + u \cdot |\nu| = 2$, +proceeding by cases on the values of $|\nu'|$, $|\nu''|$, and $|u|$. +Firstly, if $|\nu'| = 2$ then $|\nu''| = |u| = 0$. +Note that if any $\nu'_j = 1$ then the expectation in +\eqref{eq:mondrian_app_bias} is zero. +Hence we can assume $\nu'_j \in \{0, 2\}$, yielding +% +\begin{align*} +\frac{1}{2 \lambda^2} +\! \sum_{j=1}^d +\frac{\partial^2 \mu(x)}{\partial x_j^2} +\frac{1}{3} +\E \! \left[ +\frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} +\right] +&\!= +\frac{1}{2 \lambda^2} +\! \sum_{j=1}^d +\frac{\partial^2 \mu(x)}{\partial x_j^2} +\frac{1}{3} +\E \! \left[ +E_{1j}^{2} ++ E_{2j}^{2} +- E_{1j} E_{2j} +\right] += \frac{1}{2 \lambda^2} +\! \sum_{j=1}^d +\frac{\partial^2 \mu(x)}{\partial x_j^2}, +\end{align*} +% +where we used that $E_{1j}$ and $E_{2j}$ are independent $\Exp(1)$. +Next we consider $|\nu'| = 1$ and $|\nu''| = 1$, so $|u| = 0$. +Note that if $\nu'_j = \nu''_{j'} = 1$ with $j \neq j'$ then the +expectation in \eqref{eq:mondrian_app_bias} is zero. +So we need only consider $\nu'_j = \nu''_j = 1$, giving +% +\begin{align*} +\frac{1}{\lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j} +\frac{1}{3} +\E \left[ +\frac{E_{2j}^{3} + E_{1j}^{3}} +{E_{2j} + E_{1j}} +\right] +&= +\frac{1}{\lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j}. +\end{align*} +% +Finally, we have the case where $|\nu'| = 1$, $|\nu''| = 0$ +and $|u|=1$. Then $u_\nu = 1$ for some $|\nu| = 1$ and zero otherwise. +Note that if $\nu'_j = \nu_{j'} = 1$ with $j \neq j'$ then the +expectation is zero. So we need only consider $\nu'_j = \nu_j = 1$, giving +% +\begin{align*} +&- \frac{1}{\lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j} +\frac{1}{4} +\E \left[ +\frac{(E_{2j}^2 - E_{1j}^2)^2} +{(E_{2j} + E_{1j})^2} +\right] \\ +&\quad= +- \frac{1}{4 \lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j} +\E \left[ +E_{1j}^2 ++ E_{2j}^2 +- 2 E_{1j} E_{2j} +\right] += +- \frac{1}{2 \lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j}. +\end{align*} +% +Hence the second-order bias term is +% +\begin{align*} +\frac{1}{2 \lambda^2} +\sum_{j=1}^d +\frac{\partial^2 \mu(x)}{\partial x_j^2} ++ \frac{1}{2 \lambda^2} +\frac{1}{f(x)} +\sum_{j=1}^{d} +\frac{\partial \mu(x)}{\partial x_j} +\frac{\partial f(x)}{\partial x_j}. +\end{align*} + +\proofparagraph{calculating the bias if the data is uniformly distributed} + +If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$ and +the bias expansion from \eqref{eq:mondrian_app_bias} becomes +% +\begin{align*} +\sum_{|\nu'|=1}^{\flbeta} +\lambda^{- |\nu'|} +\frac{\partial^{\nu'} \mu(x)}{\nu' !} +\prod_{j=1}^d +\E \left[ +\frac{E_{2j}^{\nu'_j + 1} ++ (-1)^{\nu'_j} E_{1j}^{\nu'_j + 1}} +{(\nu'_j + 1) (E_{2j} + E_{1j})} +\right]. 
+\end{align*} +% +This is zero if any $\nu_j'$ is odd, +so we group these terms based on the exponent of $\lambda$ to see +% +\begin{align*} +\frac{B_r(x)}{\lambda^{2r}} +&= +\frac{1}{\lambda^{2r}} +\sum_{|\nu|=r} +\frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} +\prod_{j=1}^d +\frac{1}{2\nu_j + 1} +\E \left[ +\frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} +{E_{2j} + E_{1j}} +\right]. +\end{align*} +% +Since $\int_0^\infty \frac{e^{-t}}{a+t} \diff t = e^a \Gamma(0,a)$ +and $\int_0^\infty s^a \Gamma(0, a) \diff s = \frac{a!}{a+1}$, +with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$ +the upper incomplete gamma function, +the expectation is easily calculated as +% +\begin{align*} +\E \left[ +\frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} +{E_{2j} + E_{1j}} +\right] +&= +2 +\int_{0}^{\infty} +s^{2\nu_j + 1} +e^{-s} +\int_{0}^{\infty} +\frac{e^{-t}} +{s + t} +\diff t +\diff s \\ +&= +2 \int_{0}^{\infty} +s^{2\nu_j + 1} +\Gamma(0, s) +\diff s += +\frac{(2 \nu_j + 1)!}{\nu_j + 1}, +\end{align*} +% +so finally +% +\begin{align*} +\frac{B_r(x)}{\lambda^{2r}} +&= +\frac{1}{\lambda^{2r}} +\sum_{|\nu|=r} +\frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} +\prod_{j=1}^d +\frac{1}{2\nu_j + 1} +\frac{(2 \nu_j + 1)!}{\nu_j + 1} += +\frac{1}{\lambda^{2r}} +\sum_{|\nu|=r} +\partial^{2 \nu} \mu(x) +\prod_{j=1}^d +\frac{1}{\nu_j + 1}. +\end{align*} +% +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation}] +This follows from the debiased version in +Theorem~\ref{thm:mondrian_variance_estimation_debiased} +with $J=0$, $a_0 = 1$, and $\omega_0 = 1$. +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_confidence}] +% +By Theorem~\ref{thm:mondrian_bias} +and Theorem~\ref{thm:mondrian_variance_estimation}, +% +\begin{align*} +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu(x) - \mu(x)}{\hat \Sigma(x)^{1/2}} +&= +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} +{\hat \Sigma(x)^{1/2}} ++ \sqrt{\frac{n}{\lambda^d}} +\frac{\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x)} +{\hat \Sigma(x)^{1/2}} \\ +&= +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} +{\hat \Sigma(x)^{1/2}} ++ \sqrt{\frac{n}{\lambda^d}} \, +O_\P \left( +\frac{1}{\lambda^{\beta \wedge 2}} ++ \frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} +\right). +\end{align*} +% +The first term now converges weakly to $\cN(0,1)$ by +Slutsky's theorem, Theorem~\ref{thm:mondrian_clt}, +and Theorem~\ref{thm:mondrian_variance_estimation}, +while the second term is $o_\P(1)$ by assumption. +Validity of the confidence interval follows immediately. +% +\end{proof} + +\subsection{Debiased Mondrian random forests} + +We give rigorous proofs of the central limit theorem, +bias characterization, variance estimation, +confidence interval validity, and minimax optimality +results for the debiased Mondrian random forest estimator. + +\begin{proof}[Theorem~\ref{thm:mondrian_clt_debiased}] + +We use the martingale central limit theorem given by +\citet[Theorem~3.2]{hall1980martingale}. +For each $1 \leq i \leq n$ define +$\cH_{n i}$ to be the filtration +generated by $\bT$, $\bX$, and +$(\varepsilon_j : 1 \leq j \leq i)$, +noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ +because $B$ increases weakly as $n$ increases. 
+Let $\I_{i b r}(x) = \I\{X_i \in T_{b r}(x)\}$ +where $T_{b r}(x)$ is the cell containing $x$ in tree $b$ +used to construct $\hat \mu_r(x)$, +and similarly let $N_{b r}(x) = \sum_{i=1}^n \I_{i b r}(x)$ +and $\I_{b r}(x) = \I\{N_{b r}(x) \geq 1\}$. +Define the $\cH_{n i}$-measurable and square integrable +variables +% +\begin{align*} +S_i(x) +&= +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +\omega_r +\frac{1}{B} \sum_{b=1}^B +\frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)}, +\end{align*} +% +which satisfy the martingale +difference property +$\E [ S_i(x) \mid \cH_{n i} ] = 0$. +Further, +% +\begin{align*} +\sqrt{\frac{n}{\lambda^d}} +\big( +\hat\mu_\rd(x) +- \E\left[ +\hat\mu_\rd(x) \mid \bX, \bT +\right] +\big) += \sum_{i=1}^n S_i(x). +\end{align*} +% +By \citet[Theorem~3.2]{hall1980martingale} +it suffices to check that +% +\begin{inlineroman} +\item $\max_i |S_i(x)| \to 0$ in probability,% +\label{it:mondrian_app_hall_prob} +\item $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and% +\label{it:mondrian_app_hall_exp} +\item $\sum_i S_i(x)^2 \to \Sigma_\rd(x)$ in probability. +\label{it:mondrian_app_hall_var} +\end{inlineroman} + +\proofparagraph{checking condition \ref{it:mondrian_app_hall_prob}} +% +Since $J$ is fixed and +$\E[|\varepsilon_i|^3 \mid X_i]$ is bounded, +by Jensen's inequality and +Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, +% +\begin{align*} +\E\left[\max_{1 \leq i \leq n} |S_i(x)| \right] +&= +\E\left[\max_{1 \leq i \leq n} +\left| +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +\omega_r +\frac{1}{B} \sum_{b=1}^B +\frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} +\right| +\right] \\ +&\leq +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +|\omega_r| +\frac{1}{B} +\E\left[\max_{1 \leq i \leq n} +\left| +\sum_{b=1}^B +\frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} +\right| +\right] \\ +&\leq +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +|\omega_r| +\frac{1}{B} +\E\left[ +\sum_{i=1}^{n} +\left( +\sum_{b=1}^B +\frac{\I_{i b r}(x) |\varepsilon_i|} {N_{b r}(x)} +\right)^3 +\right]^{1/3} \\ +&= +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +|\omega_r| +\frac{1}{B} +\E\left[ +\sum_{i=1}^{n} +|\varepsilon_i|^3 +\sum_{b=1}^B +\sum_{b'=1}^B +\sum_{b''=1}^B +\frac{\I_{i b r}(x) } {N_{b r}(x)} +\frac{\I_{i b' r}(x) } {N_{b' r}(x)} +\frac{\I_{i b'' r}(x) } {N_{b'' r}(x)} +\right]^{1/3} \\ +&\lesssim +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +|\omega_r| +\frac{1}{B^{2/3}} +\E\left[ +\sum_{b=1}^B +\sum_{b'=1}^B +\frac{\I_{b r}(x)} {N_{b r}(x)} +\frac{\I_{b' r}(x)} {N_{b' r}(x)} +\right]^{1/3} \\ +&\lesssim +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +|\omega_r| +\frac{1}{B^{2/3}} +\left( +B^2 \frac{a_r^{2d} \lambda^{2d}}{n^2} ++ B \frac{a_r^{2d} \lambda^{2d} \log n}{n^2} +\right)^{1/3} \\ +&\lesssim +\left( \frac{\lambda^d}{n} \right)^{1/6} ++ \left( \frac{\lambda^d}{n} \right)^{1/6} +\left( \frac{\log n}{B} \right)^{1/3} +\to 0. 
+\end{align*} + +\proofparagraph{checking condition \ref{it:mondrian_app_hall_exp}} +% +Since $\E[\varepsilon_i^2 \mid X_i]$ is bounded +and by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, +% +\begin{align*} +\E\left[\max_{1 \leq i \leq n} S_i(x)^2 \right] +&= +\E\left[ +\max_{1 \leq i \leq n} +\left( +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +\omega_r +\frac{1}{B} \sum_{b=1}^B +\frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} +\right)^2 +\right] \\ +&\leq +\frac{n}{\lambda^d} +\frac{1}{B^2} +(J+1)^2 +\max_{0 \leq r \leq J} +\omega_r^2 +\,\E\left[ +\sum_{i=1}^{n} +\sum_{b=1}^B +\sum_{b'=1}^B +\frac{\I_{i b r}(x) \I_{i b' r}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r}(x)} +\right] \\ +&\lesssim +\frac{n}{\lambda^d} +\max_{0 \leq r \leq J} +\E\left[ +\frac{\I_{b r}(x)}{N_{b r}(x)} +\right] +\lesssim +\frac{n}{\lambda^d} +\max_{0 \leq r \leq J} +\frac{a_r^d \lambda^d}{n} +\lesssim 1. +\end{align*} + +\proofparagraph{checking condition \ref{it:mondrian_app_hall_var}} + +Next, we have +% +\begin{align} +\label{eq:mondrian_app_clt_condition_sum} +\sum_{i=1}^n +S_i(x)^2 +&= +\sum_{i=1}^n +\left( +\sqrt{\frac{n}{\lambda^d}} +\sum_{r=0}^{J} +\omega_r +\frac{1}{B} \sum_{b=1}^B +\frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} +\right)^2 \\ +&= +\nonumber +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\sum_{b=1}^B +\sum_{b'=1}^B +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} \\ +\nonumber +&= +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\sum_{b=1}^B +\left( +\frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b r'}(x)} ++ \sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right). +\end{align} +% +By boundedness of $\E[\varepsilon_i^2 \mid X_i]$ +and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, +the first term in \eqref{eq:mondrian_app_clt_condition_sum} +vanishes as +% +\begin{align*} +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\sum_{b=1}^B +\E \left[ +\frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b r'}(x)} +\right] +&\lesssim +\frac{n}{\lambda^d} +\frac{1}{B^2} +\max_{0 \leq r \leq J} +\sum_{b=1}^B +\E \left[ +\frac{\I_{b r}(x)}{N_{b r}(x)} +\right] +\lesssim +\frac{1}{B} +\to 0. 
+\end{align*} +% +For the second term in \eqref{eq:mondrian_app_clt_condition_sum}, +the law of total variance gives +% +\begin{align} +\nonumber +&\Var \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +\nonumber +&\quad\leq +(J+1)^4 +\max_{0 \leq r, r' \leq J} +\omega_r +\omega_{r'} +\Var \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +\nonumber +&\quad\lesssim +\max_{0 \leq r, r' \leq J} +\E \left[ +\Var \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX, \bY +\right] +\right] \\ +\label{eq:mondrian_app_total_variance} +&\qquad+ +\max_{0 \leq r, r' \leq J} +\Var \left[ +\E \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX, \bY +\right] +\right] +\end{align} +% +For the first term in \eqref{eq:mondrian_app_total_variance}, +% +\begin{align*} +&\E \left[ +\Var \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX, \bY +\right] +\right] \\ +&\quad= +\frac{n^2}{\lambda^{2d}} +\frac{1}{B^4} +\sum_{i=1}^n +\sum_{j=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\sum_{\tilde b=1}^B +\sum_{\tilde b' \neq \tilde b} +\E \Bigg[ +\varepsilon_i^2 +\varepsilon_j^2 +\left( +\frac{\I_{i b r}(x) \I_{i b' r'}(x) } +{N_{b r}(x) N_{b' r'}(x)} +- \E +\left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) } +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX +\right] +\right) \\ +&\qquad\quad +\times +\left( +\frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } +{N_{\tilde b r}(x) N_{ \tilde b' r'}(x)} +- \E +\left[ +\frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } +{N_{\tilde b r}(x) N_{\tilde b' r'}(x)} +\Bigm| \bX +\right] +\right) +\Bigg]. +\end{align*} +% +Since $T_{b r}$ is independent of $T_{b' r'}$ given +$\bX, \bY$, the summands are zero +whenever $\big|\{b, b', \tilde b, \tilde b'\}\big| = 4$. +Since $\E[ \varepsilon_i^2 \mid X_i]$ is bounded +and by the Cauchy--Schwarz inequality +and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, +% +\begin{align*} +&\E \left[ +\Var \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX, \bY +\right] +\right] \\ +&\quad\lesssim +\frac{n^2}{\lambda^{2d}} +\frac{1}{B^3} +\sum_{b=1}^B +\sum_{b' \neq b} +\E \left[ +\left( +\sum_{i=1}^n +\frac{\I_{i b r}(x) \I_{i b' r'}(x) } +{N_{b r}(x) N_{b' r'}(x)} +\right)^2 +\right] +\lesssim +\frac{n^2}{\lambda^{2d}} +\frac{1}{B} +\E \left[ +\frac{\I_{b r}(x)}{N_{b r}(x)} +\frac{\I_{b' r'}(x)}{N_{b' r'}(x)} +\right] +\lesssim +\frac{1}{B} +\to 0. +\end{align*} +% +For the second term in \eqref{eq:mondrian_app_total_variance}, +the random variable inside the variance is a nonlinear +function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, +so we apply the Efron--Stein inequality +\citep{efron1981jackknife}. 
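+For completeness, we recall the form of the Efron--Stein inequality used in
+the next display: for a generic measurable function $g$ of independent
+random variables $W_1, \ldots, W_n$ (notation introduced here only to state
+the inequality), writing $Z = g(W_1, \ldots, W_n)$ and
+$Z^{(j)} = g(W_1, \ldots, \tilde W_j, \ldots, W_n)$
+with $\tilde W_j$ an independent copy of $W_j$,
+%
+\begin{align*}
+\Var \left[ Z \right]
+&\leq
+\frac{1}{2}
+\sum_{j=1}^{n}
+\E \left[
+\big( Z - Z^{(j)} \big)^2
+\right].
+\end{align*}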
+Let $(\tilde X_{i j}, \tilde Y_{i j}) = (X_i, Y_i)$
+if $i \neq j$, and if $i = j$ let it be an
+independent copy of $(X_j, Y_j)$,
+denoted $(\tilde X_j, \tilde Y_j)$,
+and define $\tilde \varepsilon_{i j} = \tilde Y_{i j} - \mu(\tilde X_{i j})$.
+Write
+$\tilde \I_{i j b r}(x) = \I \big\{ \tilde X_{i j} \in T_{b r}(x) \big\}$
+and
+$\tilde \I_{j b r}(x) = \I \big\{ \tilde X_{j} \in T_{b r}(x) \big\}$,
+and also
+$\tilde N_{j b r}(x) = \sum_{i=1}^{n} \tilde \I_{i j b r}(x)$.
+We use the leave-one-out notation
+$N_{-j b r}(x) = \sum_{i \neq j} \I_{i b r}(x)$
+and also write
+$N_{-j b r \cap b' r'}(x) = \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x)$.
+Since $\E[ \varepsilon_i^4 \mid X_i]$ is bounded,
+%
+\begin{align*}
+&\Var \left[
+\E \left[
+\frac{n}{\lambda^d}
+\frac{1}{B^2}
+\sum_{i=1}^n
+\sum_{b=1}^B
+\sum_{b' \neq b}
+\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2}
+{N_{b r}(x) N_{b' r'}(x)}
+\Bigm| \bX, \bY
+\right]
+\right] \\
+&\quad\leq
+\Var \left[
+\E \left[
+\frac{n}{\lambda^d}
+\sum_{i=1}^n
+\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2}
+{N_{b r}(x) N_{b' r'}(x)}
+\Bigm| \bX, \bY
+\right]
+\right] \\
+&\quad\leq
+\frac{1}{2}
+\frac{n^2}{\lambda^{2d}}
+\sum_{j=1}^{n}
+\E \left[
+\left(
+\sum_{i=1}^n
+\left(
+\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2}
+{N_{b r}(x) N_{b' r'}(x)}
+- \frac{\tilde \I_{i j b r}(x) \tilde \I_{i j b' r'}(x)
+\tilde \varepsilon_{i j}^2}
+{\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)}
+\right)
+\right)^2
+\right] \\
+&\quad\leq
+\frac{n^2}{\lambda^{2d}}
+\sum_{j=1}^{n}
+\E \left[
+\left(
+\left|
+\frac{1}
+{N_{b r}(x) N_{b' r'}(x)}
+- \frac{1}
+{\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)}
+\right|
+\sum_{i \neq j}
+\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2
+\right)^2
+\right] \\
+&\qquad+
+\frac{n^2}{\lambda^{2d}}
+\sum_{j=1}^{n}
+\E \left[
+\left(
+\left(
+\frac{\I_{j b r}(x) \I_{j b' r'}(x) \varepsilon_j^2}
+{N_{b r}(x) N_{b' r'}(x)}
+- \frac{\tilde \I_{j b r}(x) \tilde \I_{j b' r'}(x)
+\tilde \varepsilon_j^2}
+{\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)}
+\right)
+\right)^2
+\right] \\
+&\quad\lesssim
+\frac{n^2}{\lambda^{2d}}
+\sum_{j=1}^{n}
+\E \left[
+N_{-j b r \cap b' r'}(x)^2
+\left|
+\frac{1}
+{N_{b r}(x) N_{b' r'}(x)}
+- \frac{1}
+{\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)}
+\right|^2
++ \frac{\I_{j b r}(x) \I_{j b' r'}(x)}
+{N_{b r}(x)^2 N_{b' r'}(x)^2}
+\right].
+\end{align*}
+%
+For the first term in the above display, note that
+%
+\begin{align*}
+&\left|
+\frac{1}{N_{b r}(x) N_{b' r'}(x)}
+- \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)}
+\right| \\
+&\quad\leq
+\frac{1}{N_{b r}(x)}
+\left|
+\frac{1} {N_{b' r'}(x)} - \frac{1} {\tilde N_{j b' r'}(x)}
+\right|
++ \frac{1}{\tilde N_{j b' r'}(x)}
+\left|
+\frac{1} {N_{b r}(x)} - \frac{1} {\tilde N_{j b r}(x)}
+\right| \\
+&\quad\leq
+\frac{1}{N_{-j b r}(x)}
+\frac{1} {N_{-j b' r'}(x)^2}
++ \frac{1}{N_{-j b' r'}(x)}
+\frac{1} {N_{-j b r}(x)^2}
+\end{align*}
+%
+since $|N_{b r}(x) - \tilde N_{j b r}(x)| \leq 1$
+and $|N_{b' r'}(x) - \tilde N_{j b' r'}(x)| \leq 1$.
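+Here the last step uses the elementary bound
+$\big| \frac{1}{m} - \frac{1}{m'} \big|
+= \frac{|m - m'|}{m m'}
+\leq \frac{1}{m m'}$,
+valid for integers $m, m' \geq 1$ with $|m - m'| \leq 1$,
+together with
+$N_{b r}(x) \wedge \tilde N_{j b r}(x) \geq N_{-j b r}(x)$
+and $N_{b' r'}(x) \wedge \tilde N_{j b' r'}(x) \geq N_{-j b' r'}(x)$.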
+Further, these terms are non-zero only on the events +$\{ X_j \in T_{b r}(x) \} \cup \{ \tilde X_j \in T_{b r}(x) \}$ +and $\{ X_j \in T_{b' r'}(x) \} \cup \{ \tilde X_j \in T_{b' r'}(x) \}$ +respectively, so +% +\begin{align*} +&\Var \left[ +\E \left[ +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{b=1}^B +\sum_{b' \neq b} +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\Bigm| \bX, \bY +\right] +\right] \\ +&\, \lesssim +\frac{n^2}{\lambda^{2d}} +\sum_{j=1}^{n} +\E \left[ +\frac{\I_{j b' r'}(x) + \tilde \I_{j b' r'}(x)}{N_{-j b r}(x)^2} +\frac{N_{-j b r \cap b' r}(x)^2} {N_{-j b' r'}(x)^4} +\right. \\ +&\left. +\qquad+ +\frac{\I_{j b r}(x) + \tilde \I_{j b r}(x)}{N_{-j b' r'}(x)^2} +\frac{N_{-j b r \cap b' r}(x)^2} {N_{-j b r}(x)^4} ++ +\frac{\I_{j b r}(x) \I_{j b' r'}(x)} +{N_{b r}(x)^2 N_{b' r'}(x)^2} +\right] \\ +&\, \lesssim +\frac{n^2}{\lambda^{2d}} +\sum_{j=1}^{n} +\E \left[ +\frac{\I_{j b r}(x) \I_{b r}(x) \I_{b' r'}(x)} +{N_{b r}(x)^2 N_{b' r'}(x)^2} +\right] +\lesssim +\frac{n^2}{\lambda^{2d}} +\E \left[ +\frac{\I_{b r}(x) \I_{b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)^2} +\right] \\ +&\lesssim +\frac{n^2}{\lambda^{2d}} +\frac{\lambda^d}{n} +\frac{\lambda^{2d} \log n}{n^2} +\lesssim +\frac{\lambda^d \log n}{n} +\to 0, +\end{align*} +% +where we used Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. +So +$\sum_{i=1}^{n} S_i(x)^2 - n \,\E \left[ S_i(x)^2 \right] += O_\P \left( \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right) += o_\P(1)$. + +\proofparagraph{calculating the limiting variance} +% +Thus by \citet[Theorem~3.2]{hall1980martingale} +we conclude that +% +\begin{align*} +\sqrt{\frac{n}{\lambda^d}} +\big( +\hat\mu_\rd(x) +- \E\left[ +\hat\mu_\rd(x) \mid \bX, \bT +\right] +\big) +&\rightsquigarrow +\cN\big(0, \Sigma_\rd(x)\big) +\end{align*} +% +as $n \to \infty$, assuming that the limit +% +\begin{align*} +\Sigma_\rd(x) +&= +\lim_{n \to \infty} +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +\end{align*} +% +exists. Now we verify this and calculate the limit. +Since $J$ is fixed, it suffices to find +% +\begin{align*} +\lim_{n \to \infty} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +\end{align*} +% +for each $0 \leq r, r' \leq J$. +Firstly, note that +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \sigma^2(X_i)} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +&= +\frac{n^2}{\lambda^d} +\sigma^2(x) +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +&\quad+ +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) +\big(\sigma^2(X_i) - \sigma^2(x) \big)} +{N_{b r}(x) N_{b' r'}(x)} +\right]. 
+\end{align*} +% +Since $\sigma^2$ is Lipschitz and +$\P \left(\max_{1 \leq l \leq d} +|T_b(x)_l| \geq t/\lambda \right) \leq 2d e^{-t/2}$ +by Lemma~\ref{lem:mondrian_app_largest_cell}, +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) +\big|\sigma^2(X_i) - \sigma^2(x) \big|} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&\leq +2de^{-t/2} +\frac{n^2}{\lambda^d} ++ \frac{n^2}{\lambda^d} +\frac{t}{\lambda} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +&\lesssim +\frac{n^2}{\lambda^d} +\frac{\log n}{\lambda} +\frac{\lambda^d}{n^2} +\lesssim +\frac{\log n}{\lambda}, +\end{align*} +% +by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, +where we set $t = 4 \log n$. +Therefore +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\sigma^2(x) +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)} +\right] ++ O \left( \frac{\log n}{\lambda} \right). +\end{align*} +% +Next, by conditioning on +$T_{b r}$, $T_{b' r'}$, $N_{-i b r}(x)$, and $N_{-i b' r'}(x)$, +% +\begin{align*} +&\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)} +\right] += \E \left[ +\frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} f(\xi) \diff \xi} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] \\ +&\quad= f(x) \, +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] ++ +\E \left[ +\frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} +(f(\xi) - f(x)) \diff \xi} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] \\ +&\quad= +f(x) \, +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] ++ O \left( +\frac{\lambda^d}{n^2} +\frac{(\log n)^{d+1}}{\lambda} +\right) +\end{align*} +% +arguing using Lemma~\ref{lem:mondrian_app_largest_cell}, +the Lipschitz property of $f(x)$, +and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \! \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\sigma^2(x) +f(x) +\frac{n^2}{\lambda^d} +\E \! \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] +\! + O \! \left( +\frac{(\log n)^{d+1}}{\lambda} +\right). +\end{align*} +% +Now we apply the binomial result in +Lemma~\ref{lem:mondrian_app_binomial_expectation} +to approximate the expectation. With +$N_{-i b' r' \setminus b r}(x) = +\sum_{j \neq i} \I\{X_j \in T_{b' r'}(x) \setminus T_{b r}(x)\}$, +% +\begin{align*} +&\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right] += \E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{N_{-i b r}(x)+1} +\right. \\ +&\qquad\left. +\times \, +\E \left[ +\frac{1} +{N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} +\Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) +\right] +\right]. +\end{align*} +% +Now conditional on +$\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, +% +\begin{align*} +N_{-i b' r' \setminus b r}(x) +&\sim \Bin\left( +n - 1 - N_{-i b r}(x), \ +\frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} +{1 - \int_{T_{b r}(x)} +f(\xi) \diff \xi} +\right). +\end{align*} +% +We bound these parameters above and below. 
+Firstly, by Lemma~\ref{lem:mondrian_app_active_data} with $B=1$, +% +\begin{align*} +\P \left( N_{-i b r}(x) > +t^{d+1} +\frac{n}{\lambda^d} +\right) +&\leq +4 d e^{- t / (4 \|f\|_\infty(1 + 1/a_r))} +\leq +e^{- t / C} +\end{align*} +% +for some $C > 0$ and sufficiently large $t$. +Next, if $f$ is $L$-Lipschitz in $\ell^2$, +by Lemma~\ref{lem:mondrian_app_largest_cell}, +% +\begin{align*} +&\P \left( +\left| +\frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} +{1 - \int_{T_{b r}(x)} f(\xi) +\diff \xi} +- f(x) |T_{b' r'}(x) \setminus T_{b r}(x)| +\right| +> t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{\lambda} +\right) \\ +&\quad\leq +\P \left( +\int_{T_{b' r'}(x) \setminus T_{b r}(x)} +\left| f(\xi) - f(x) \right| +\diff \xi +> t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2 \lambda} +\right) \\ +&\qquad+ +\P \left( +\frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi +\cdot \int_{T_{b r}(x)} f(\xi) \diff \xi} +{1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} +> t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} +\right) \\ +&\quad\leq +\P \left( +L d\, +|T_{b' r'}(x) \setminus T_{b r}(x)| +\max_{1 \leq j \leq d} |T_{b' r'}(x)_j| +> t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} +\right) \\ +&\qquad+ +\P \left( +\|f\|_\infty +\,|T_{b' r'}(x) \setminus T_{b r}(x)| +\frac{\|f\|_\infty |T_{b r}(x)|} +{1 - \|f\|_\infty |T_{b r}(x)|} +> t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} +\right) \\ +&\quad\leq +\P \left( +\max_{1 \leq j \leq d} |T_{b' r'}(x)_j| +> \frac{t}{2\lambda L d} +\right) ++\P \left( +|T_{b r}(x)| +> \frac{t}{4\lambda \|f\|_\infty^2} +\right) \\ +&\quad\leq +2 d e^{-t a_r /(4L d)} ++ 2 d e^{-t a_r / (8 \|f\|_\infty^2)} +\leq e^{-t/C}, +\end{align*} +% +for large $t$, +increasing $C$ as necessary. +Thus with probability at least $1 - e^{-t/C}$, +increasing $C$, +% +\begin{align*} +N_{-i b' r' \setminus b r}(x) +&\leq \Bin\left( +n, \, +|T_{b' r'}(x) \setminus T_{b r}(x)| +\left( f(x) + \frac{t}{\lambda} \right) +\right) \\ +N_{-i b' r' \setminus b r}(x) +&\geq +\Bin\left( +n +\left( 1 - \frac{t^{d+1}}{\lambda^d} +- \frac{1}{n} \right), \, +|T_{b' r'}(x) \setminus T_{b r}(x)| +\left( f(x) - \frac{t}{\lambda} \right) +\right). +\end{align*} +% +So by Lemma~\ref{lem:mondrian_app_binomial_expectation} conditionally on +$\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, +we have with probability at least $1 - e^{-t/C}$ that +% +\begin{align*} +&\left| +\E \left[ +\frac{1} +{N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} +\Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) +\right] +\right. +\\ +&\left. +\qquad- +\frac{1} +{N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} +\right| \\ +&\quad\lesssim +\frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} +{\left(N_{-i b' r' \cap b r}(x) ++ n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2}. +\end{align*} +% +Therefore, by the same approach as the proof of +Lemma~\ref{lem:mondrian_app_moment_denominator}, +taking $t = 3 C \log n$, +% +\begin{align*} +& +\left| +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} +\right.\right. \\ +&\left.\left. 
+\qquad - +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(N_{-i b r}(x)+1) +(N_{-i b' r' \cap b r}(x)+n f(x) +|T_{b' r'}(x) \setminus T_{b r}(x)|+1)} +\right] +\right| \\ +&\quad\lesssim +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} +\frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} +{\left(N_{-i b' r' \cap b r}(x) ++ n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2} +\right] ++ +e^{-t/C} \\ +&\quad\lesssim +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{n |T_{b r}(x)|+1} +\frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} +{(n |T_{b' r'}(x)| + 1)^2} +\right] ++ e^{-t/C} \\ +&\quad\lesssim +\E \left[ +\frac{1}{n} +\frac{1} +{(n |T_{b' r'}(x)| + 1)^2} ++ \frac{1}{n} +\frac{t / \lambda} +{n |T_{b' r'}(x)| + 1} +\right] ++ e^{-t/C} \\ +&\quad\lesssim +\frac{\lambda^{2d} \log n}{n^3} ++ \frac{\log n}{n \lambda} +\frac{\lambda^d}{n} +\lesssim +\frac{\lambda^d}{n^2} +\left( +\frac{\lambda^{d} \log n}{n} ++ \frac{\log n}{\lambda} +\right). +\end{align*} +% +Now apply the same argument to the other +term in the expectation, to see that +% +\begin{align*} +&\left| +\E \left[ +\frac{1} +{N_{-i b r \cap b' r'}(x)+N_{-i b r \setminus b' r'}(x)+1} +\Bigm| \bT, N_{-i b r \cap b' r'}(x), N_{-i b' r' \setminus b r}(x) +\right] +\right. \\ +&\left. +\qquad- +\frac{1} +{N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} +\right| \\ +&\quad\lesssim +\frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} +{\left(N_{-i b r \cap b' r'}(x) ++ n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2}. +\end{align*} +% +with probability at least $1 - e^{-t/C}$, +and so likewise again with $t = 3 C \log n$, +% +\begin{align*} +&\frac{n^2}{\lambda^d} +\left| +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} +\frac{1} +{N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} +\right] +\right. +\\ +&\left. +\quad- +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} +\right.\right. \\ +&\qquad\qquad\left.\left. +\times +\frac{1} +{N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} +\right] +\right| \\ +&\lesssim +\frac{n^2}{\lambda^d} \, +\E \left[ +\frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} +{\left(N_{-i b r \cap b' r'}(x) ++ n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2} +\right. \\ +&\qquad\qquad\left. +\times +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} +\right] ++ \frac{n^2}{\lambda^d} +e^{-t/C} \\ +&\lesssim +\frac{\lambda^d \log n}{n} ++ \frac{\log n}{\lambda}. +\end{align*} +% +Thus far we have proven that +% +\begin{align*} +&\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] += \sigma^2(x) +f(x) +\frac{n^2}{\lambda^d} \\ +&\quad\times +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} +\right. \\ +&\left. +\qquad\qquad +\times +\frac{1} +{N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} +\right] \\ +&\quad+ +O \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{\lambda^d \log n}{n} +\right). +\end{align*} +% +We remove the $N_{-i b r \cap b' r'}(x)$ terms. 
+With probability at least $1 - e^{-t/C}$, conditional on $\bT$, +% +\begin{align*} +N_{-i b r \cap b' r'}(x) +&\leq \Bin\left( +n, \, +|T_{b r}(x) \cap T_{b' r'}(x)| +\left( f(x) + \frac{t}{\lambda} \right) +\right), \\ +N_{-i b r \cap b' r'}(x) +&\geq +\Bin\left( +n +\left( 1 - \frac{t^{d+1}}{\lambda^d} +- \frac{1}{n} \right), \, +|T_{b r}(x) \cap T_{b' r'}(x)| +\left( f(x) - \frac{t}{\lambda} \right) +\right). +\end{align*} +% +Therefore, by Lemma~\ref{lem:mondrian_app_binomial_expectation} +applied conditionally on $\bT$, +with probability at least $1 - e^{-t/C}$, +% +\begin{align*} +& +\left| +\E \! \left[ +\frac{1} +{N_{-i b r \cap b' r'}(x) ++ n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} +\frac{1} +{N_{-i b' r' \cap b r}(x) ++ n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} +\! \Bigm| \! \bT +\right] +\right. +\\ +&\left. +\qquad- +\frac{1} +{n f(x) |T_{b r}(x)|+1} +\frac{1} +{n f(x) |T_{b' r'}(x)|+1} +\right| \\ +&\quad\lesssim +\frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} +{(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} +\left( +\frac{1}{n |T_{b r}(x)| + 1} ++ \frac{1}{n |T_{b' r'}(x)| + 1} +\right). +\end{align*} +% +Now by Lemma~\ref{lem:mondrian_app_moment_cell}, +with $t = 3 C \log n$, +% +\begin{align*} +&\frac{n^2}{\lambda^d} +\left| +\E \! \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{N_{-i b r \cap b' r'}(x) ++ n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} +\frac{1} +{N_{-i b' r' \cap b r}(x) ++ n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} +\right] +\right. \\ +&\left. +\qquad- +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{n f(x) |T_{b r}(x)|+1} +\frac{1} +{n f(x) |T_{b' r'}(x)|+1} +\right] +\right| \\ +&\quad\lesssim +\frac{n^2}{\lambda^d} +\E \left[ +|T_{b r}(x) \cap T_{b' r'}(x)| +\frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} +{(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} +\frac{1}{n |T_{b r}(x)| + 1} ++ \frac{1}{n |T_{b' r'}(x)| + 1} +\right] \\ +&\qquad+ +\frac{n^2}{\lambda^d} +e^{-t/C} \\ +&\quad\lesssim +\frac{n^2}{\lambda^d} +\frac{1}{n^3} +\E \left[ +\frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} +{|T_{b r}(x)| |T_{b' r'}(x)|} +\right] ++ \frac{n^2}{\lambda^d} +e^{-t/C} \\ +&\quad\lesssim +\frac{1}{n \lambda^d} +\E \left[ +\frac{1}{|T_{b r}(x)| |T_{b' r'}(x)|} +\right] ++ \frac{t}{\lambda^{d+1}} +\E \left[ +\frac{1}{|T_{b r}(x)|} +\right] ++ \frac{n^2}{\lambda^d} +e^{-t/C} \\ +&\quad\lesssim +\frac{\lambda^d}{n} ++ \frac{\log n}{\lambda}. +\end{align*} +% +This allows us to deduce that +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\sigma^2(x) +f(x) +\frac{n^2}{\lambda^d} +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{(n f(x) |T_{b r}(x)|+1)(n f(x) |T_{b' r'}(x)|+1)} +\right] \\ +&\quad+ +O \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{\lambda^d \log n}{n} +\right). +\end{align*} +% +Now that we have reduced the limiting variance to an expression +only involving the sizes of Mondrian cells, +we can exploit their exact distribution to compute this expectation. 
+Recall from \citet[Proposition~1]{mourtada2020minimax} +that we can write +% +\begin{align*} +|T_{b r}(x)| +&= \prod_{j=1}^{d} +\left( +\frac{E_{1j}}{a_r \lambda} \wedge x_j ++ \frac{E_{2j}}{a_r \lambda} \wedge (1 - x_j) +\right), \\ +|T_{b' r'}(x)| +&= +\prod_{j=1}^{d} +\left( +\frac{E_{3j}}{a_{r'} \lambda} \wedge x_j ++ \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) +\right), \\ +|T_{b r }(x)\cap T_{b' r'}(x)| +&= \prod_{j=1}^{d} +\left( +\frac{E_{1j}}{a_r \lambda} \wedge +\frac{E_{3j}}{a_{r'} \lambda} +\wedge x_j ++ \frac{E_{2j}}{a_r \lambda} \wedge +\frac{E_{4j}}{a_{r'} \lambda} +\wedge (1 - x_j) +\right) +\end{align*} +% +where $E_{1j}$, $E_{2j}$, $E_{3j}$, and $E_{4j}$ +are independent and $\Exp(1)$. +Define their non-truncated versions +% +\begin{align*} +|\tilde T_{b r}(x)| +&= +a_r^{-d} +\lambda^{-d} +\prod_{j=1}^{d} +\left( E_{1j} + E_{2j} \right), \\ +|\tilde T_{b' r'}(x)| +&= +a_{r'}^{-d} +\lambda^{-d} +\prod_{j=1}^{d} +\left( E_{3j} + E_{4j} \right), \\ +|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)| +&= +\lambda^{-d} +\prod_{j=1}^{d} +\left( +\frac{E_{1j}}{a_r} +\wedge +\frac{E_{3j}}{a_{r'}} ++ \frac{E_{2j}}{a_r} +\wedge +\frac{E_{4j}}{a_{r'}} +\right), +\end{align*} +% +and note that +% +\begin{align*} +&\P \left( +\big( \tilde T_{b r}(x), \tilde T_{b' r'}(x), +\tilde T_{b r}(x) \cap T_{b' r'}(x) \big) +\neq +\big( T_{b r}(x), T_{b' r'}(x), T_{b r}(x) \cap T_{b' r'}(x) \big) +\right) \\ +&\,\leq +\sum_{j=1}^{d} +\big( +\P(E_{1j} \geq a_r \lambda x_j) ++ \P(E_{3j} \geq a_{r'} \lambda x_j) ++ \P(E_{2j} \geq a_r \lambda (1 - x_j)) ++ \P(E_{4j} \geq a_{r'} \lambda (1 - x_j)) +\big) \\ +&\,\leq e^{-C \lambda} +\end{align*} +% +for some $C > 0$ and sufficiently large $\lambda$. +So by Cauchy--Schwarz and Lemma~\ref{lem:mondrian_app_moment_cell}, +% +\begin{align*} +& +\frac{n^2}{\lambda^d} +\left| +\E \left[ +\frac{|T_{b r}(x) \cap T_{b' r'}(x)|} +{n f(x) |T_{b r}(x)|+1} +\frac{1} +{n f(x) |T_{b' r'}(x)|+1} +\right] +- \E \left[ +\frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} +{n f(x) |\tilde T_{b r}(x)|+1} +\frac{1} +{n f(x) |\tilde T_{b' r'}(x)|+1} +\right] +\right| \\ +&\quad\lesssim +\frac{n^2}{\lambda^d} +e^{-C \lambda} +\lesssim +e^{-C \lambda / 2} +\end{align*} +% +as $\log \lambda \gtrsim \log n$. +Therefore +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\sigma^2(x) +f(x) +\frac{n^2}{\lambda^d} +\E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} +\right] \\ +&\quad+ +O \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{\lambda^d \log n}{n} +\right). +\end{align*} +% +We remove the superfluous units in the denominators. +Firstly, by independence of the trees, +% +\begin{align*} +& \frac{n^2}{\lambda^d} +\left| +\E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} +\right] +- \E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} +\right] +\right| \\ +&\quad\lesssim +\frac{n^2}{\lambda^d} +\E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{n |\tilde T_{b r}(x)|} +\frac{1} +{n^2 |\tilde T_{b' r'}(x)|^2} +\right] +\lesssim +\frac{1}{n \lambda^d} +\E \left[ +\frac{1}{|T_{b r}(x)|} +\right] +\E \left[ +\frac{1}{|T_{b' r'}(x)|} +\right] +\lesssim +\frac{\lambda^d}{n}. 
+\end{align*} +% +Secondly, we have in exactly the same manner that +% +\begin{align*} +\frac{n^2}{\lambda^d} +\left| +\E \left[ +\frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} +{(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} +\right] +- \E \left[ +\frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} +{n^2 f(x)^2 |\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} +\right] +\right| +&\lesssim +\frac{\lambda^d}{n}. +\end{align*} +% +Therefore +% +\begin{align*} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] +&= +\frac{\sigma^2(x)}{f(x)} +\frac{1}{\lambda^d} +\E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} +\right] ++ O \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{\lambda^d \log n}{n} +\right). +\end{align*} +% +It remains to compute this integral. +By independence over $1 \leq j \leq d$, +% +\begin{align*} +&\E \left[ +\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} +{|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} +\right] \\ +&\quad= +a_r^d a_{r'}^d \lambda^d +\prod_{j=1}^d +\E \left[ +\frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'}) ++ (E_{2j} a_r) \wedge (E_{4j} / a_{r'}) } +{ \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right)} +\right] \\ +&\quad= +2^d a_r^d a_{r'}^d \lambda^d +\prod_{j=1}^d +\E \left[ +\frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'})} +{ \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right) } +\right] \\ +&\quad= +2^d a_r^d a_{r'}^d \lambda^d +\prod_{j=1}^d +\int_{0}^{\infty} +\int_{0}^{\infty} +\int_{0}^{\infty} +\int_{0}^{\infty} +\frac{ (t_1 / a_r) \wedge (t_3 / a_{r'}) } +{ \left( t_1 + t_2 \right) \left( t_3 + t_4 \right) } +e^{-t_1 - t_2 - t_3 - t_4} +\diff t_1 +\diff t_2 +\diff t_3 +\diff t_4 \\ +&\quad= +2^d a_r^d a_{r'}^d \lambda^d +\prod_{j=1}^d +\int_{0}^{\infty} +\int_{0}^{\infty} +((t_1 / a_r) \wedge (t_3 / a_{r'})) +e^{-t_1 - t_3} \\ +&\qquad\times +\left( +\int_{0}^{\infty} +\frac{e^{-t_2}}{t_1 + t_2} +\diff t_2 +\right) +\left( +\int_{0}^{\infty} +\frac{e^{-t_4}}{t_3 + t_4} +\diff t_4 +\right) +\diff t_1 +\diff t_3 \\ +&\quad= +2^d a_r^d a_{r'}^d \lambda^d +\prod_{j=1}^d +\int_{0}^{\infty} +\int_{0}^{\infty} +((t / a_r) \wedge (s / a_{r'})) +\Gamma(0, t) +\Gamma(0, s) +\diff t +\diff s, +\end{align*} +% +as $\int_0^\infty \frac{e^{-t}}{a + t} \diff t = e^a \Gamma(0, a)$ +with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$. 
Now +% +\begin{align*} +&2 +\int_{0}^{\infty} +\int_{0}^{\infty} +((t / a_r) \wedge (s / a_{r'})) +\Gamma(0, t) +\Gamma(0, s) +\diff t +\diff s \\ +&\quad= +\int_0^\infty +\Gamma(0, t) +\left( +\frac{1}{a_{r'}} +\int_0^{a_{r'} t / a_r} +2 s \Gamma(0, s) +\diff{s} ++ +\frac{t}{a_r} +\int_{a_{r'} t / a_r}^\infty +2 \Gamma(0, s) +\diff{s} +\right) +\diff{t} \\ +&\quad= +\int_0^\infty +\Gamma(0, t) +\left( +\frac{t}{a_r} +e^{- \frac{a_{r'}}{a_r}t} +- \frac{1}{a_{r'}} e^{- \frac{a_{r'}}{a_r}t} ++ \frac{1}{a_{r'}} +- \frac{a_{r'}}{a_r^2} t^2 +\Gamma\left(0, \frac{a_{r'}}{a_r} t\right) +\right) +\diff{t} \\ +&\quad= +\frac{1}{a_r} +\int_0^\infty +t e^{- \frac{a_{r'}}{a_r} t} +\Gamma(0, t) +\diff{t} +- \frac{1}{a_{r'}} +\int_0^\infty +e^{- \frac{a_{r'}}{a_r} t} +\Gamma(0, t) +\diff{t} \\ +&\qquad+ +\frac{1}{a_{r'}} +\int_0^\infty +\Gamma(0, t) +\diff{t} +- +\frac{a_{r'}}{a_r^2} +\int_0^\infty +t^2 \Gamma\left(0, \frac{a_{r'}}{a_r} t\right) +\Gamma(0, t) +\diff{t}, +\end{align*} +% +since +$\int_0^a 2 t \Gamma(0, t) \diff t = a^2 \Gamma(0, a) - a e^{-a} -e^{-a} + 1$ +and +$\int_a^\infty \Gamma(0, t) \diff t = e^{-a} - a \Gamma(0, a)$. +Next, we use +% +$ \int_{0}^{\infty} \Gamma(0, t) \diff t = 1$, +$\int_{0}^{\infty} e^{-at} \Gamma(0, t) \diff t += \frac{\log(1+a)}{a}$, +$\int_{0}^{\infty} t e^{-at} \Gamma(0, t) \diff t += \frac{\log(1+a)}{a^2} - \frac{1}{a(a+1)}$, +and +$\int_{0}^{\infty} t^2 \Gamma(0, t) \Gamma(0, at) \diff t += - \frac{2a^2 + a + 2}{3a^2 (a+1)} + \frac{2(a^3 + 1) \log(a+1)}{3a^3} +- \frac{2 \log a}{3}$ +to see +% +\begin{align*} +&2 +\int_{0}^{\infty} +\int_{0}^{\infty} +((t / a_r) \wedge (s / a_{r'})) +\Gamma(0, t) +\Gamma(0, s) +\diff t +\diff s \\ +&\quad= +\frac{a_r \log(1+a_{r'} / a_r)}{a_{r'}^2} +- \frac{a_r / a_{r'}}{a_r + a_{r'}} +- \frac{a_r \log(1 + a_{r'} / a_r)}{a_{r'}^2} ++ \frac{1}{a_{r'}} \\ +&\qquad+ +\frac{2 a_{r'}^2 + a_r a_{r'} + 2 a_r^2} +{3 a_r a_{r'} (a_r + a_{r'})} +- \frac{2(a_{r'}^3 + a_r^3) \log(a_{r'} / a_r+1)}{3 a_r^2 a_{r'}^2} ++ \frac{2 a_{r'} \log (a_{r'} / a_r)}{3 a_r^2} \\ +&\quad= +\frac{2}{3 a_r} + \frac{2}{3 a_{r'}} +- \frac{2(a_r^3 + a_{r'}^3 ) \log(a_{r'} / a_{r}+1)} +{3 a_r^2 a_{r'}^2} ++ \frac{2 a_{r'} \log (a_{r'} / a_{r})}{3 a_r^2} \\ +&\quad= +\frac{2}{3 a_r} ++ \frac{2}{3 a_{r'}} +- \frac{2 a_{r'} \log(a_{r} / a_{r'} + 1)}{3 a_r^2} +- \frac{2 a_r \log(a_{r'} / a_{r} + 1)}{3 a_{r'}^2} \\ +&\quad= +\frac{2}{3 a_r} +\left( +1 - \frac{a_{r'}}{a_r} +\log\left(\frac{a_{r}}{a_{r'}} + 1\right) +\right) ++ \frac{2}{3 a_{r'}} +\left( +1 - \frac{a_r }{a_{r'}} +\log\left(\frac{a_{r'}}{a_{r}} + 1\right) +\right). +\end{align*} +% +Finally, we conclude by giving the limiting variance. +% +\begin{align*} +&\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\frac{n^2}{\lambda^d} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +&\quad= +\frac{\sigma^2(x)}{f(x)} +\sum_{r=0}^{J} +\sum_{r'=0}^{J} +\omega_r +\omega_{r'} +\left( +\frac{2 a_{r'}}{3} +\left( +1 - \frac{a_{r'}}{a_r} +\log\left(\frac{a_r}{a_{r'}} + 1\right) +\right) ++ \frac{2 a_r}{3} +\left( +1 - \frac{a_r}{a_{r'}} +\log\left(\frac{a_{r'}}{a_r} + 1\right) +\right) +\right)^d \\ +&\qquad+ +O \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{\lambda^d \log n}{n} +\right). 
+\end{align*} +% +So the limit exists, and +with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} +\log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, +the limiting variance is +% +\begin{align*} +\Sigma_\rd(x) +&= +\frac{\sigma^2(x)}{f(x)} +\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} +\left( \ell_{r r'} + \ell_{r' r} \right)^d. +\end{align*} +% +\end{proof} + +The new bias characterization with debiasing is an algebraic +consequence of the original bias characterization and the construction +of the debiased Mondrian random forest estimator. + +\begin{proof}[Theorem~\ref{thm:mondrian_bias_debiased}] + +By the definition of the debiased estimator and +Theorem~\ref{thm:mondrian_bias}, since $J$ and $a_r$ are fixed, +% +\begin{align*} +\E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] +&= +\sum_{l=0}^J +\omega_l +\E \big[ +\hat \mu_l(x) +\Bigm| \bX, \bT +\big] \\ +&= +\sum_{l=0}^J +\omega_l +\left( +\mu(x) ++ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} +\frac{B_r(x)}{a_l^{2r} \lambda^{2r}} +\right) ++ O_\P \left( +\frac{1}{\lambda^\beta} ++ \frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} +\right). +\end{align*} +% +It remains to evaluate the first term. +Recalling that $A_{r s} = a_{r-1}^{2 - 2s}$ +and $A \omega = e_0$, we have +% +\begin{align*} +&\sum_{l=0}^J +\omega_l +\left( +\mu(x) ++ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} +\frac{B_r(x)}{a_l^{2r} \lambda^{2r}} +\right) \\ +&\quad= +\mu(x) +\sum_{l=0}^J +\omega_l ++ +\sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} +\frac{B_r(x)}{\lambda^{2r}} +\sum_{l=0}^J +\frac{\omega_l}{a_l^{2r}} \\ +&\quad= +\mu(x) +(A \omega)_1 ++ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor \wedge J} +\frac{B_r(x)}{\lambda^{2r}} +(A \omega)_{r+1} ++ \sum_{r = (\lfloor \flbeta / 2 \rfloor \wedge J) + 1} +^{\lfloor \flbeta / 2 \rfloor} +\frac{B_r(x)}{\lambda^{2r}} +\sum_{l=0}^J +\frac{\omega_l}{a_l^{2r}} \\ +&\quad= +\mu(x) ++ \I\{\lfloor \flbeta / 2 \rfloor \geq J + 1\} +\frac{B_{J+1}(x)}{\lambda^{2J + 2}} +\sum_{l=0}^J +\frac{\omega_l}{a_l^{2J + 2}} ++ O \left( \frac{1}{\lambda^{2J + 4}} \right) \\ +&\quad= +\mu(x) ++ \I\{2J + 2 < \beta\} +\frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} ++ O \left( \frac{1}{\lambda^{2J + 4}} \right). +\end{align*} +% +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation_debiased}] + +\proofparagraph{consistency of $\hat\sigma^2(x)$} + +Recall that +% +\begin{align} +\label{eq:mondrian_app_sigma2_hat_proof} +\hat\sigma^2(x) +&= +\frac{1}{B} +\sum_{b=1}^{B} +\frac{\sum_{i=1}^n Y_i^2 \, \I\{X_i \in T_b(x)\}} +{\sum_{i=1}^n \I\{X_i \in T_b(x)\}} +- \hat \mu(x)^2. +\end{align} +% +The first term in \eqref{eq:mondrian_app_sigma2_hat_proof} +is simply a Mondrian forest estimator of +$\E[Y_i^2 \mid X_i = x] = \sigma^2(x) + \mu(x)^2$, +which is bounded and Lipschitz, +where $\E[Y_i^4 \mid X_i]$ is bounded almost surely. +So its conditional bias is controlled +by Theorem~\ref{thm:mondrian_bias} and is at most +$O_\P \left( \frac{1}{\lambda} + +\frac{\log n}{\lambda} \sqrt{\lambda^d / n} \right)$. +Its variance is +at most $\frac{\lambda^d}{n}$ by Theorem~\ref{thm:mondrian_clt_debiased}. +Consistency of the second term in \eqref{eq:mondrian_app_sigma2_hat_proof} +follows directly from Theorems~\ref{thm:mondrian_bias} and +\ref{thm:mondrian_clt_debiased} with the same bias and variance bounds. +Therefore +% +\begin{align*} +\hat\sigma^2(x) +&= +\sigma^2(x) ++ O_\P \left( +\frac{1}{\lambda} ++ \sqrt{\frac{\lambda^d}{n}} +\right). 
+\end{align*} + +\proofparagraph{consistency of the sum} +% +Note that +% +\begin{align*} +&\frac{n}{\lambda^d} +\sum_{i=1}^n +\left( +\sum_{r=0}^J +\omega_r +\frac{1}{B} +\sum_{b=1}^B +\frac{\I\{X_i \in T_{r b}(x)\}} +{\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} +\right)^2 \\ +&\quad= +\frac{n}{\lambda^d} +\frac{1}{B^2} +\sum_{i=1}^n +\sum_{r=0}^J +\sum_{r'=0}^J +\omega_r +\omega_{r'} +\sum_{b=1}^B +\sum_{b'=1}^B +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)}. +\end{align*} +% +This is exactly the same as the quantity in +\eqref{eq:mondrian_app_clt_condition_sum}, if we were to take +$\varepsilon_i$ to be $\pm 1$ with equal probability. +Thus we immediately have convergence in probability +by the proof of Theorem~\ref{thm:mondrian_clt_debiased}: +% +\begin{align*} +\frac{n}{\lambda^d} +\sum_{i=1}^n +\left( +\sum_{r=0}^J +\omega_r +\frac{1}{B} +\sum_{b=1}^B +\frac{\I\{X_i \in T_{r b}(x)\}} +{\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} +\right)^2 +&= +\frac{n^2}{\lambda^d} +\sum_{r=0}^J +\sum_{r'=0}^J +\omega_r +\omega_{r'} +\E \left[ +\frac{\I_{i b r}(x) \I_{i b' r'}(x)} +{N_{b r}(x) N_{b' r'}(x)} +\right] \\ +&\quad+ +O_\P \left( +\frac{1}{\sqrt B} ++ \sqrt{\frac{\lambda^d \log n}{n}} +\right). +\end{align*} + +\proofparagraph{conclusion} + +By the proof of Theorem~\ref{thm:mondrian_clt_debiased} +with $\varepsilon_i$ being $\pm 1$ with equal probability, +and by previous parts, +% +\begin{align*} +\hat\Sigma_\rd(x) += \Sigma_\rd(x) ++ O_\P \left( +\frac{(\log n)^{d+1}}{\lambda} ++ \frac{1}{\sqrt B} ++ \sqrt{\frac{\lambda^d \log n}{n}} +\right). +\end{align*} + +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_confidence_debiased}] +% +By Theorem~\ref{thm:mondrian_bias_debiased} +and Theorem~\ref{thm:mondrian_variance_estimation_debiased}, +% +\begin{align*} +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu_\rd(x) - \mu(x)}{\hat \Sigma_\rd(x)^{1/2}} +&= +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} +{\hat \Sigma_\rd(x)^{1/2}} ++ \sqrt{\frac{n}{\lambda^d}} +\frac{\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \mu(x)} +{\hat \Sigma_\rd(x)^{1/2}} \\ +&= +\sqrt{\frac{n}{\lambda^d}} +\frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} +{\hat \Sigma_\rd(x)^{1/2}} ++ \sqrt{\frac{n}{\lambda^d}} \, +O_\P \left( +\frac{1}{\lambda^\beta} ++ \frac{1}{\lambda \sqrt B} ++ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} +\right). +\end{align*} +% +The first term converges weakly to $\cN(0,1)$ by +Slutsky's theorem and Theorems~\ref{thm:mondrian_clt_debiased} +and \ref{thm:mondrian_variance_estimation_debiased}, +while the second is $o_\P(1)$ by assumption. +Validity of the confidence interval follows. +% +\end{proof} + +\begin{proof}[Theorem~\ref{thm:mondrian_minimax}] + +Theorem~\ref{thm:mondrian_bias_debiased} +and the proof of Theorem~\ref{thm:mondrian_clt_debiased} +with $J = \lfloor \flbeta / 2 \rfloor$ gives +% +\begin{align*} +\E \left[ +\big( +\hat \mu_\rd(x) +- \mu(x) +\big)^2 +\right] +&= +\E \left[ +\big( +\hat \mu_\rd(x) +- \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] +\big)^2 +\right] ++ \E \left[ +\big( +\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] +- \mu(x) +\big)^2 +\right] \\ +&\lesssim +\frac{\lambda^d}{n} ++ \frac{1}{\lambda^{2\beta}} ++ \frac{1}{\lambda^2 B}. +\end{align*} +% +We use here an $L^2$ version of Theorem~\ref{thm:mondrian_bias_debiased} +which is immediate from the proof of Theorem~\ref{thm:mondrian_bias}, +since we leveraged Chebyshev's inequality. 
Now since
+$\lambda \asymp n^{\frac{1}{d + 2 \beta}}$ and
+$B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$,
+%
+\begin{align*}
+\E \left[
+\big(
+\hat \mu_\rd(x)
+- \mu(x)
+\big)^2
+\right]
+&\lesssim
+n^{-\frac{2\beta}{d + 2 \beta}}.
+\end{align*}
+\end{proof}
+
+\section{Further properties of the Mondrian process}
+
+In this section, we state and prove a collection of lemmas concerning
+various properties of the Mondrian process. While they are not used directly
+in our analysis of Mondrian random forest estimators, we believe that
+these results, along with the techniques displayed during their proofs,
+may be of independent interest.
+
+Our analysis of Mondrian random forest estimators in the main text
+is for the most part
+conducted pointwise, in the sense that we first fix $x \in [0,1]^d$
+and then analyze $\hat\mu(x)$. This means that we interact with the Mondrian
+process
+only through $T(x)$; that is, the cell in $T$ which contains the point $x$.
+As such, we rely only on local properties of $T$, and may consider just a
+single Mondrian cell. The lemmas in this section take a more global approach
+to analyzing the Mondrian process, and we make statements about the
+entire process $T$, rather than individual cells $T(x)$.
+Such results may be useful for a future investigation of the uniform
+properties of Mondrian forest estimators, as well as
+being interesting in their own right.
+
+We begin with a tail bound for the number of cells appearing
+in a Mondrian tree, offering a multiplicative
+exponential inequality which
+complements the exact expectation result given in
+\citet[Proposition~2]{mourtada2020minimax}.
+The resulting bound in probability is the same up to
+logarithmic terms, and the sharp tail decay is useful
+in combination with union bounds in our upcoming results.
+
+\begin{lemma}[Tail bound for the number of cells in a Mondrian tree]
+\label{lem:mondrian_app_cells_tail}
+
+Let $D \subseteq \R^d$ be a rectangle and
+$T \sim \cM(D, \lambda)$. Writing
+$\# T$ for the number of cells in $T$,
+%
+\begin{align*}
+\P\left(
+\# T > 3 (1 + \lambda |D|_1)^d
+(t + 1 + d \log(1 + \lambda |D|_1))
+\right)
+&\leq
+e^{-t}.
+\end{align*}
+
+\end{lemma}
+
+\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_tail}]
+
+We refer to this method as the ``subcell trick''
+and attribute it to \citet{mourtada2017universal}.
+For $\varepsilon > 0$, partition $D$ into
+at most $(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$
+with side lengths at most $(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$.
+Denote the restriction of a tree $T$ to a subcell $D'$ by $T \cap D'$.
+Since a split in $T$ induces a split in at least one $T \cap D'$,
+by a union bound
+%
+\begin{align*}
+\P\left(\# T > t \right)
+&\leq
+\P\left(\sum_{D' \in \cD_\varepsilon}
+\# (T \cap D') > t \right)
+\leq
+\sum_{D' \in \cD_\varepsilon}
+\P\left(
+\# (T \cap D') >
+\frac{t}{\# \cD_\varepsilon}
+\right).
+\end{align*}
+%
+Now $\# (T \cap D')$ is dominated by a Yule process
+with parameter $|D'|_1$ stopped at time $\lambda$
+\citep[proof of Lemma~2]{mourtada2017universal},
+so using the fact that if
+$X \sim \Yule(a)$
+then $\P(X_t > n) \leq (1-e^{-at})^{n-1}$,
+%
+\begin{align*}
+\P\left(\# T > t \right)
+&\leq
+\# \cD_\varepsilon \,
+(1 - e^{-\lambda |D|_1 \varepsilon})^{t / \# \cD_\varepsilon - 1}
+\leq
+(1 + 1/\varepsilon)^d
+(1 - e^{-\lambda |D|_1 \varepsilon})^{t (1 + 1/\varepsilon)^{-d} - 1}.
+\end{align*}
+%
+Set $\varepsilon = \frac{1}{\lambda |D|_1}$,
+note $1-1/e \leq e^{-1/3}$
+and replace $t$ by
+$3 (1 + \lambda |D|_1)^d
+(t + 1 + d \log(1 + \lambda |D|_1))$:
+%
+\begin{align*}
+&\P\left(\# T > t \right)
+\leq
+(1 + \lambda |D|_1)^d
+(1 - 1/e)^{t (1 + \lambda |D|_1)^{-d} - 1}
+\leq
+2 (1 + \lambda |D|_1)^d
+e^{-t (1 + \lambda |D|_1)^{-d} / 3}, \\
+&\P\left(\# T >
+3
+(1 + \lambda |D|_1)^d
+(t + 1 + d \log(1 + \lambda |D|_1))
+\right)
+\leq
+e^{-t}.
+\end{align*}
+%
+\end{proof}
+
+Next we provide a rigorous justification of the observation that the cells
+in a Mondrian process should have the same shape distribution, though
+of course they are not independent. To state and prove this result,
+we need a way to identify a particular cell by endowing the
+cells in a Mondrian tree with a natural order.
+
+\begin{definition}[Canonical order of cells in a Mondrian tree]
+Let $T \sim \cM(D, \lambda)$.
+Each cell in a fixed realization of $T$ can be described by
+a word from the alphabet $\{l, r\}$,
+where $l$ indicates the cell to the left of a split
+and $r$ indicates the cell to the right.
+For example, if there are no splits we have one cell
+described by the empty word.
+After one split there are two cells, denoted
+$l$ and $r$.
+Now suppose that the cell $r$ splits again, giving two splits and three cells,
+denoted $l$, $r l$, and $r r$.
+Define the canonical ordering of the cells of $T$ by applying
+the lexicographic order to their words, with $l < r$.
+Note that it does not matter which coordinate each split occurs in:
+in two dimensions, $l$ might refer to the ``left'' or ``bottom''
+and $r$ to the ``right'' or ``top'' cell.
+\end{definition}
+
+\begin{lemma}[Cells in a Mondrian tree have identically distributed shapes]
+\label{lem:mondrian_app_cells_identically_distributed}
+
+Let $T \sim \cM(D, \lambda)$
+with ordered cells $D'_1, \ldots, D'_{\# T}$.
+For $\varepsilon_1, \ldots, \varepsilon_d \geq 0$
+and $1 \leq i \leq k$,
+%
+\begin{align*}
+\P\left(
+|D'_{i1}| \leq \varepsilon_1,
+\ldots, |D'_{id}| \leq \varepsilon_d,
+\# T = k
+\right)
+&=
+\P\left(
+|D'_{11}| \leq \varepsilon_1,
+\ldots, |D'_{1d}| \leq \varepsilon_d,
+\# T = k
+\right).
+\end{align*}
+%
+Marginalizing over $\# T$
+with $E_j$ i.i.d.\ $\Exp(1)$,
+\citet[Proposition~1]{mourtada2020minimax} gives
+%
+\begin{align*}
+\P\left(
+|D'_{i1}| > \varepsilon_1,
+\ldots, |D'_{id}| > \varepsilon_d
+\right)
+&=
+\prod_{j=1}^d
+\P\left(
+\frac{E_j}{\lambda} \wedge |D_j|
+> \varepsilon_j
+\right)
+= \prod_{j=1}^d
+\I\{|D_j| > \varepsilon_j\}
+e^{-\lambda \varepsilon_j}.
+\end{align*}
+
+\end{lemma}
+
+We observe a version of the famous Poisson process inspection or waiting time
+paradox in the sizes of Mondrian cells. The above
+Lemma~\ref{lem:mondrian_app_cells_identically_distributed} shows that for a
+large enough
+lifetime $\lambda$, the volume of any cell $D'_i$ has the same distribution
+as the volume of the corner cell $D'_1$, and is asymptotically
+$\E[|D'_i|] \asymp \E \left[ \prod_{j=1}^{d} (E_j / \lambda) \right]
+= 1/\lambda^d$.
+This is consistent with \citet[Proposition~2]{mourtada2020minimax}, who give
+$\E[\# T] \asymp \lambda^d$.
+However, if, instead of selecting a cell directly,
+we select a fixed interior point $x$
+and query the cell $T(x)$ which contains it, we find that
+$\E[|T(x)|] \asymp \E \left[
+\prod_{j=1}^{d} ((E_{1j} + E_{2j}) / \lambda) \right]
+= 2^d/\lambda^d$, where $E_{1j}, E_{2j}$ are i.i.d.\ $\Exp(1)$,
+by \citet[Proposition~1]{mourtada2020minimax}.
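+As a concrete illustration, and purely as a remark, in dimension $d = 1$
+these two calculations read
+%
+\begin{align*}
+\E \left[ |D'_i| \right]
+&\asymp \frac{\E[E]}{\lambda}
+= \frac{1}{\lambda},
+\qquad
+\E \left[ |T(x)| \right]
+\asymp \frac{\E[E + E']}{\lambda}
+= \frac{2}{\lambda},
+\end{align*}
+%
+with $E$ and $E'$ independent $\Exp(1)$ variables, mirroring the classical
+waiting time paradox, in which the interarrival interval containing a fixed
+inspection time is on average around twice as long as a typical interarrival
+interval.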
+Since $T(x)$ contains $x$ by construction, a size-biasing phenomenon occurs +and we see that $T(x)$ is on average larger than a typical Mondrian cell. + +\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_identically_distributed}] + +Let $w$ be the word associated with the cell $D_i \in T$. +Note that $i=1$ if and only if $r \notin w$, as then $D_i$ is the left child +of every split. +So suppose $r \in w$. +Let $\tilde w$ be the word obtained by replacing all occurrences +of $r$ in $w$ with an $l$. +Each such replacement corresponds to a split in $T$. +Let $\tilde T$ be the same process as $T$ but with the following +modification: for each split where a replacement was made, +change the uniform random variable $S$ +(from the definition of $T$, see Section~\ref{sec:mondrian_process}) to $1-S$. +Since $S$ is independent of everything else in the construction of $T$, +we observe that $\tilde T \sim \cM(D, \lambda)$ also. +Further, there is almost surely exactly one cell in $\tilde T$ +which has the same shape as $D$, as the uniform distribution has no atoms. +Denote this cell by $\tilde D$ and note that +the replacements imply that its word in $\tilde T$ +is $\tilde w$. +Thus $\tilde D = \tilde D_1$ in $\tilde T$ and so +$(|D_{i1}|, \ldots, |D_{i d}|, \# T) += (|\tilde D_{11}|, \ldots, |\tilde D_{1d}|, \# \tilde T)$. +Equality of the distributions follows. +\end{proof} + +As our next result we provide a tail bound for the size of the largest +Mondrian cell. The cells within a Mondrian tree are of course not independent, +and in fact there should intuitively be some negative correlation between their +sizes, due to the fact that they must all fit within the original cell $D$. + +\begin{lemma}[Tail bound on largest Mondrian cell] +\label{lem:mondrian_app_largest_cell_tail} + +Let $T \sim \cM(D, \lambda)$. +For any $\varepsilon > 0$, +% +\begin{align*} +\P\left( +\max_{D' \in T} +\max_{1 \leq j \leq d} +|D'_j| > \varepsilon +\right) +&\leq +5d (1 + \lambda |D|_1)^{d+1} +e^{-\lambda \varepsilon}. +\end{align*} +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell_tail}] + +Let $D_i$ be the ordered cells of $T$ and take $k \geq 1$. +By union bounds and +Lemma~\ref{lem:mondrian_app_cells_identically_distributed}, +% +\begin{align*} +\P\left( +\max_{D' \in T} +\max_{1 \leq j \leq d} +|D'_j| > \varepsilon +\right) +&\leq +\sum_{l=1}^k +\P\left( +\max_{1 \leq i \leq l} +\max_{1 \leq j \leq d} +|D_{i j}| > \varepsilon, +\# T = l +\right) ++ \P\left( \# T > k \right) \\ +&\leq +\sum_{l=1}^k +\sum_{i=1}^l +\sum_{j=1}^d +\P\big( +|D_{i j}| > \varepsilon, +\# T = l +\big) ++ \P\left( \# T > k \right) \\ +&\leq +\sum_{l=1}^k +l d \, +\P\big( +|D_{1j}| > \varepsilon, +\# T = l +\big) ++ \P\left( \# T > k \right) \\ +&\leq +k d \, +\P\big(|D_{1 j}| > \varepsilon \big) ++ \P\left( \# T > k \right). +\end{align*} +% +For the first term we use the exact distribution of +$D_1$ from Lemma~\ref{lem:mondrian_app_cells_identically_distributed} +and for the second term we apply Lemma~\ref{lem:mondrian_app_cells_tail}. +% +\begin{align*} +\P\left( +\max_{D' \in T} +\max_{1 \leq j \leq d} +|D'_j| > \varepsilon +\right) +&\leq +k d \, \P\big(|D_{1 j}| > \varepsilon \big) ++ \P\left( \# T > k \right) \\ +&\leq +k d \, e^{-\lambda \varepsilon} ++ 2 (1 + \lambda |D|_1)^d +e^{-k (1 + \lambda |D|_1)^{-d} / 3}. +\end{align*} +% +Finally, set +$k = \big\lceil 3 \lambda \varepsilon (1 + \lambda |D|_1)^d \big\rceil$ +and note the bound is trivial unless $\varepsilon \leq |D|_1$. 
+% +\begin{align*} +\P\left( +\max_{D' \in T} +\max_{1 \leq j \leq d} +|D'_j| > \varepsilon +\right) +&\leq +\big( 3 \lambda \varepsilon (1 + \lambda |D|_1)^d + 1 \big) +d \, e^{-\lambda \varepsilon} ++ 2 (1 + \lambda |D|_1)^d +e^{-\lambda \varepsilon} \\ +&\leq +3d (1 + \lambda |D|_1)^{d+1} +e^{-\lambda \varepsilon} ++ 2 (1 + \lambda |D|_1)^d +e^{-\lambda \varepsilon} \\ +&\leq +5d (1 + \lambda |D|_1)^{d+1} +e^{-\lambda \varepsilon}. +\end{align*} +% +\end{proof} + +For the remainder of this section, we turn our attention to the partitions +generated by Mondrian random forests. In particular, we study the refinement +generated by overlaying $B$ independent Mondrian processes with possibly +different lifetime parameters, and intersecting their resulting individual +partitions. + +\begin{definition}[Partition refinement]% +% +Let $T_1, \ldots, T_B$ be partitions of a set. +Their common refinement is +% +\begin{align*} +\bigwedge_{b=1}^B T_b += \left\{ +\bigcap_{b=1}^B D_b: +D_b \in T_b +\right\} +\bigsetminus +\left\{ \emptyset \right\}. +\end{align*} +% +\end{definition} + +We begin our analysis of Mondrian forest refinements with a pair of simple +inequalities for bounding the total number of refined cells +in Lemma~\ref{lem:mondrian_app_refinement_inequalities}. This result does not +depend +on the probabilistic structure of the Mondrian process, and holds for any +rectangular partitions. + +\begin{lemma}[Inequalities for refinements of rectangular partitions] +\label{lem:mondrian_app_refinement_inequalities} + +Let $T_1, \ldots, T_B$ be rectangular partitions of a $d$-dimensional +rectangle $D$. Then +% +\begin{align} +\label{eq:mondrian_app_refinement_1} +\# \bigwedge_{b=1}^B T_b +&\leq \prod_{b=1}^B \# T_b, +\end{align} +% +and for all $B \leq d$ there exist $T_b$ such that +\eqref{eq:mondrian_app_refinement_1} holds with equality. +If $\# T_{b j}$ denotes the number of splits +made by $T_b$ in dimension $j$, then +% +\begin{align} +\label{eq:mondrian_app_refinement_2} +\# \bigwedge_{b=1}^B T_b +&\leq \prod_{j=1}^d +\left( 1 + \sum_{b=1}^B \# T_{b j} \right), +\end{align} +% +and for all $B \geq d$ there exist $T_b$ such that +\eqref{eq:mondrian_app_refinement_2} holds with equality. + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_inequalities}] + +The first inequality \eqref{eq:mondrian_app_refinement_1} +follows because every cell in +$\bigwedge_b T_b$ is the intersection of cells +$D_b \in T_b$ for $1 \leq b \leq B$, and there at at most +$\prod_{b=1}^{B} \# T_b$ ways to choose these. +This bound is achievable when $B \leq d$ by setting +$T_b$ to be a tree with splits only in dimension $b$, +so that every such intersection of cells +gives a cell in the refinement. + +For the second inequality \eqref{eq:mondrian_app_refinement_2}, +we construct a new forest of trees. +In particular, for each $1 \leq j \leq d$ define +$A_j$ to be the set of locations in $D_j$ where a tree $T_b$ +makes a split in dimension $j$ for some $b$. +Define $T'_j$ to be a tree which has splits +only in dimension $j$ and at the locations prescribed by $A_j$. +Clearly, since every split in $T'_j$ +comes from a split in some $T_b$ in dimension $j$, +we have $\# T'_j \leq 1 + \sum_b \# T_{b j}$. +Applying the first inequality to this new forest yields +$\# \bigwedge_j T'_j \leq \prod_j \# T'_j +\leq \prod_j \big( 1 + \sum_b \# T_{b j} \big)$. +Finally, note that $\bigwedge_j T'_j$ +is a refinement of $\bigwedge_b T_b$ and the result follows. 
+This bound is achievable when $B \geq d$ by letting +$T_b$ have splits only in dimension $b$ when $b \leq d$ +and to be the trivial partition otherwise. +% +\end{proof} + +The inequalities in Lemma~\ref{lem:mondrian_app_refinement_inequalities} provide +rather crude bounds for the number of cells in a Mondrian forest +refinement as they do not take into account the random structure. +Indeed, it should be clear that the ``worst case'' scenarios, involving +trees which contain splits only in a single direction, should be extremely +unlikely under the Mondrian law. In Lemma~\ref{lem:mondrian_app_refinement} we +confirm +this intuition and provide an exact value for the expected number of cells +in a Mondrian refinement by direct calculation. This result strictly generalizes +the single tree version provided as \citet[Proposition~2]{mourtada2020minimax}. + +\begin{lemma}[Expected number of cells in a Mondrian forest refinement] +\label{lem:mondrian_app_refinement} + +Let $D$ be a $d$-dimensional rectangle +and take $\lambda_b > 0$ for $1 \leq b \leq B$. +Let $T_b \sim \cM(D, \lambda_b)$ be independent. +Then the expected number of cells in their refinement is exactly +% +\begin{align*} +\E\left[\# \bigwedge_{b=1}^B T_b \right] +&= \prod_{j=1}^d \left( +1 + |D_j| \sum_{b=1}^B \lambda_b +\right). +\end{align*} +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement}] + +By \citet[Proposition~2]{mourtada2020minimax} +we have the result for a single tree: +% +\begin{align} +\label{eq:mondrian_app_single_tree} +\E\left[\# T_b \right] +&= \prod_{j=1}^d \left( +1 + |D_j| \lambda_b +\right). +\end{align} +% +We proceed by induction on $B$. +By the tower law, +% +\begin{align*} +\E\left[\# \bigwedge_{b=1}^B T_b \right] +&= +\E\left[ +\sum_{D' \in T_B} +\# +\bigwedge_{b=1}^{B-1} (T_b \cap D') +\right] += \E\left[ +\sum_{D' \in T_B} +\E\left[ +\# +\bigwedge_{b=1}^{B-1} (T_b \cap D') +\biggm| T_B +\right] +\right]. +\end{align*} +% +Now by the restriction property of Mondrian processes +\citep[Fact~2]{mourtada2020minimax}, +observe that $T_b \cap D' \sim \cM(D', \lambda_b)$ +conditional on $T_B$. +Then by the induction hypothesis, +% +\begin{align*} +\E\left[ +\# +\bigwedge_{b=1}^{B-1} (T_b \cap D') +\biggm| T_B +\right] +&= +\prod_{j=1}^d \left( +1 + |D'_j| \sum_{b=1}^{B-1} \lambda_b +\right) += \E\big[ +\# T_{D'} \mid T_B +\big] +\end{align*} +% +where $T_{D'} \sim \cM\big(D', \sum_{b=1}^{B-1} \lambda_B\big)$ +conditional on $T_B$, +by the result for a single tree \eqref{eq:mondrian_app_single_tree}. +The restriction property finally shows that there exist realizations +of $T_{D'}$ which ensure that +$\sum_{D' \in T_B} \# T_{D'}$ is equal in distribution +to $\# T$, where $T \sim \cM(D, \sum_{b=1}^B \lambda_b)$, +so by \eqref{eq:mondrian_app_single_tree}, +% +\begin{align*} +\E\left[\# \bigwedge_{b=1}^B T_b \right] +&= +\E\left[ +\sum_{D' \in T_B} +\E\big[ +\# T_{D'} \mid T_B +\big] +\right] += +\E\big[\# T \big] += \prod_{j=1}^d \left( +1 + |D_j| \sum_{b=1}^B \lambda_b +\right). +\end{align*} +% +\end{proof} + +While the exact expectation calculation in +Lemma~\ref{lem:mondrian_app_refinement} is neat, +sharper control on the tail +behavior of the number of cells in a Mondrian refinement is desired. +Lemma~\ref{lem:mondrian_app_refinement_tail} provides this, again +making use of the subcell trick to convert a crude bound based on +Lemma~\ref{lem:mondrian_app_refinement_inequalities} into a useful tail +inequality. +We assume for simplicity that all of the lifetimes are identical. 
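To see how loose the crude bound can be on average,
consider the illustrative special case $D = [0,1]^d$ with a
common lifetime $\lambda_b = \lambda$; the comparison below is heuristic
and serves only to motivate the sharper analysis.
The bound \eqref{eq:mondrian_app_refinement_1} controls
$\# \bigwedge_b T_b$ by $\prod_b \# T_b$,
and since the trees are independent,
\eqref{eq:mondrian_app_single_tree} gives
%
\begin{align*}
\E\left[ \prod_{b=1}^B \# T_b \right]
&= \prod_{b=1}^B \E\left[ \# T_b \right]
= (1 + \lambda)^{d B},
\end{align*}
%
which grows geometrically in $B$,
whereas Lemma~\ref{lem:mondrian_app_refinement} shows that the refinement
itself contains only $(1 + B \lambda)^d$ cells in expectation,
a quantity polynomial in $B$.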
+ +\begin{lemma}[Tail bound on the number of cells in a Mondrian forest refinement] +\label{lem:mondrian_app_refinement_tail} + +Let $T_b \sim \cM(D, \lambda)$ be i.i.d.\ for $1 \leq b \leq B$. Then +% +\begin{align*} +\P\left( +\# \bigwedge_{b=1}^B T_b +> 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d +\right) +&\leq +2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_tail}] + +We begin with a coarse estimate and refine it with the subcell trick. +By Lemma~\ref{lem:mondrian_app_refinement_inequalities} +\eqref{eq:mondrian_app_refinement_2}, +for any $t > 0$, recalling that $\# T_{b j}$ is the number +of splits made by $T_b$ in dimension $j$, +% +\begin{align} +\nonumber +\P\left( +\# \bigwedge_{b=1}^B T_b +> t +\right) +&\leq +\P\left( +\prod_{j=1}^d +\left( +1 + \sum_{b=1}^B \# T_{b j} +\right) +> t +\right) +\leq +\sum_{j=1}^d +\P\left( +1 + \sum_{b=1}^B \# T_{b j} +> t^{1/d} +\right) \\ +\label{eq:mondrian_app_refinement_tail_coarse} +&\leq +d\, \P\left( +\sum_{b=1}^B \# T_b +> t^{1/d} +\right) +\leq +d B\, +\P\left( +\# T_b > \frac{t^{1/d}}{B} +\right). +\end{align} +% +By the subcell trick, partition $D$ into +at most $(1 + 1/\varepsilon)^d$ cells +$D' \in \cD_\varepsilon$ with side lengths at most +$(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$. +As every cell in $\bigwedge_b T_b$ corresponds to +at least one cell in $\bigwedge_b (T_b \cap D')$, +% +\begin{align*} +\P\left( +\# \bigwedge_{b=1}^B T_b +> t +\right) +&\leq +\P\left( +\sum_{D' \in \cD_\varepsilon} +\# \bigwedge_{b=1}^B (T_b \cap D') +> t +\right) +\leq +\sum_{D' \in \cD_\varepsilon} +\P\left( +\# \bigwedge_{b=1}^B (T_b \cap D') +> \frac{t}{\# \cD_\varepsilon} +\right). +\end{align*} +% +Applying the coarse estimate \eqref{eq:mondrian_app_refinement_tail_coarse} +to $\# \bigwedge_b (T_b \cap D')$ gives +% +\begin{align*} +\P\left( +\# \bigwedge_{b=1}^B T_b +> t +\right) +&\leq +d B \sum_{D' \in \cD_\varepsilon} +\P\left( +\# (T_b \cap D') +> \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} +\right). +\end{align*} +% +Now apply Lemma~\ref{lem:mondrian_app_cells_tail} +and set $\varepsilon = \frac{1}{\lambda |D|_1}$ to obtain +% +\begin{align*} +\P\left( +\# \bigwedge_{b=1}^B T_b +> t +\right) +&\leq +d B \sum_{D' \in \cD_\varepsilon} +\P\left( +\# (T_b \cap D') +> \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} +\right) \\ +&\leq +d B \sum_{D' \in \cD_\varepsilon} +2 (1 + \lambda |D'|_1)^d +e^{- t^{1/d} \# \cD_\varepsilon^{-1/d} B^{-1} +(1 + \lambda |D'|_1)^{-d} / 3} \\ +&\leq +2 d B (1 + 1 / \varepsilon)^d +(1 + \lambda \varepsilon |D|_1)^d +e^{- t^{1/d} (1 + 1/\varepsilon)^{-1} B^{-1} +(1 + \lambda \varepsilon |D|_1)^{-d} / 3} \\ +&\leq +2^{d+1} d B (1 + \lambda |D|_1)^d +e^{- t^{1/d} (1 + \lambda |D|_1)^{-1} B^{-1} 2^{-d} / 3}. +\end{align*} +% +Finally, replacing $t$ by $3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d$ we have +% +\begin{align*} +\P\left( +\# \bigwedge_{b=1}^B T_b +> 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d +\right) +&\leq +2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. +\end{align*} +% +\end{proof} + +\chapter{Supplement to Dyadic Kernel Density Estimators} +\label{app:kernel} + +This section contains complementary detailed expositions of some +of our main results, along with additional technical lemmas +which may be of independent interest. We also provide full proofs +for all of our theoretical contributions. 
+ +\section{Supplementary main results} + +In this first section we provide more detailed versions of some of the +results presented in the main text, alongside some intermediate +lemmas which were skipped for conciseness. +We begin with some extra notation used throughout this appendix. + +For real vectors, +$\|\cdot\|_p$ is the standard $\ell^p$-norm defined for $p \in [1, \infty]$. +For real square matrices, +$\|\cdot\|_p$ is the operator +norm induced by the corresponding vector norm. +In particular, +$\|\cdot\|_1$ +is the maximum absolute column sum, +$\|\cdot\|_\infty$ +is the maximum absolute row sum, +and $\|\cdot\|_2$ +is the maximum singular value. +For real symmetric matrices, +$\|\cdot\|_2$ +coincides with the maximum absolute eigenvalue. +We use $\|\cdot\|_{\max}$ +to denote the largest absolute entry of a real matrix. +For real-valued functions, +$\|\cdot\|_\infty$ +denotes the (essential) supremum norm. +For a bounded set $\cX \subseteq \R$ and $a \geq 0$ +we use $[\cX \pm a]$ to denote the compact interval +$[\inf \cX - a, \ \sup \cX + a]$. +For measurable subsets of $\R^d$ +we use $\Leb$ to denote the Lebesgue measure, +and for finite sets we use $|\cdot|$ +for the cardinality. +Write $\sum_i$ +for $\sum_{i=1}^n$ +when clear from context. +Similarly, use $\sum_{i \Du +\frac{t + C_1 \log n}{\sqrt{n}} +\right) +&\leq C_2 e^{-C_3 t}, +\end{align*} +% +for some positive constants +$C_1$, $C_2$, $C_3$, +and for all $t > 0$. +By integration of tail probabilities, +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big| \sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\big| +\right] +&\lesssim +\frac{\Du \log n}{\sqrt{n}}. +\end{align*} +% +Further, +$Z_n^{L\prime}$ has the same covariance structure as +$\sqrt{n} L_n'$ in the sense that for all $w, w' \in \cW$, +% +\begin{align*} +\E\left[ +Z_n^{L\prime}(w) +Z_n^{L\prime}(w') +\right] +&= +n +\E\left[ +L_n'(w) +L_n'(w') +\right]. +\end{align*} +% +It also satisfies the following +trajectory regularity property +for any $\delta_n \in (0, 1/2]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| +Z_n^{L\prime}(w) +- Z_n^{L\prime}(w') +\big| +\right] +&\lesssim +\Du +\delta_n \sqrt{\log 1/\delta_n}, +\end{align*} +% +and has continuous trajectories. +The process $Z_n^{L\prime}$ +is a function only of $\bA_n'$ +and some random noise +which is independent of $(\bA_n', \bV_n')$. + +\end{lemma} + +\begin{lemma}[Conditional strong approximation of $E_n$] +\label{lem:kernel_app_conditional_strong_approx_En} + +Suppose Assumptions +\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. +For $n \geq 2$ +and $t_n > 0$ with $\left|\log t_n\right| \lesssim \log n$, +there exists on some probability space +a copy of +$\big(\bA_n, \bV_n, E_n\big)$, +denoted +$\big(\bA_n', \bV_n', E_n'\big)$, +and a process +$\tilde Z^{E\prime}_n$ +which is Gaussian conditional on $\bA_n'$ +and mean-zero conditional on $\bA_n'$, +satisfying +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\big| +\sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) +\big| +> t_n +\Bigm\vert \bA_n' +\right) +&\leq +C_1 +t_n^{-2} +n^{-1/2} +h^{-3/4} +(\log n)^{3/4}, +\end{align*} +$\bA_n'$-almost surely +for some constant $C_1 > 0$. +Setting $t_n = n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ +for any sequence $R_n \to \infty$ +and taking an expectation gives +% +\begin{align*} +\sup_{w \in \cW} +\big| +\sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) +\big| +&\lesssim_\P +n^{-1/4} +h^{-3/8} (\log n)^{3/8} R_n. 
+\end{align*} +% +Further, +$\tilde Z_n^{E\prime}$ has the same conditional covariance as +$\sqrt{n^2h} E_n'$ in that for all $w, w' \in \cW$, +% +\begin{align*} +\E\left[ +\tilde Z_n^{E\prime}(w) +\tilde Z_n^{E\prime}(w') +\bigm\vert \bA_n' +\right] +&= +n^2h +\E\left[ +E_n'(w) +E_n'(w') +\bigm\vert \bA_n' +\right]. +\end{align*} +% +It also satisfies the following +trajectory regularity property +for any $\delta_n \in (0, 1/(2h)]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| +\tilde Z_n^{E\prime}(w) +- \tilde Z_n^{E\prime}(w') +\big| +\right] +&\lesssim +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h\delta_n}}, +\end{align*} +% +and has continuous trajectories. + +\end{lemma} + +\begin{lemma}[Unconditional strong approximation of $E_n$] +\label{lem:kernel_app_unconditional_strong_approx_En} + +Suppose Assumptions +\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. +Let $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$ +be defined as in +Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. +For each $n \geq 2$ +there exists +(on some probability space) +a copy of +$\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$, +denoted +$\big(\bA_n'', \bV_n'', \tilde Z_n^{E\dprime}\big)$, +and a centered +Gaussian process +$Z^{E\dprime}_n$ +satisfying +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\right] +&\lesssim +n^{-1/6} (\log n)^{2/3}. +\end{align*} +% +Further, +$Z_n^{E\dprime}$ has the same +(unconditional) covariance structure as +$\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ +in the sense that for all $w, w' \in \cW$, +% +\begin{align*} +\E\left[ +Z_n^{E\dprime}(w) +Z_n^{E\dprime}(w') +\right] +&= +\E\left[ +\tilde Z_n^{E\dprime}(w) +\tilde Z_n^{E\dprime}(w') +\right] += +n^2h \, +\E\left[ +E_n(w) +E_n(w') +\right]. +\end{align*} +% +It also satisfies the following +trajectory regularity property +for any $\delta_n \in (0, 1/(2h)]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| +Z_n^{E\dprime}(w) +- Z_n^{E\dprime}(w') +\big| +\right] +&\lesssim +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h\delta_n}}. +\end{align*} +% +Finally, $Z_n^{E\dprime}$ is independent of $\bA_n''$ +and has continuous trajectories. + +\end{lemma} + +We combine these strong approximations to deduce a coupling for $\hat f_W$ in +Theorem~\ref{thm:kernel_app_strong_approx_fW}, taking care with independence +to ensure the approximating processes are jointly Gaussian. + +\begin{theorem}[Strong approximation of $\hat f_W$] +\label{thm:kernel_app_strong_approx_fW} + +Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} +hold. For each $n \geq 2$ and any sequence $R_n \to \infty$ there exists on +some probability space a centered Gaussian process $Z_n^{f\prime}$ and a copy +of $\hat f_W$, denoted $\hat f_W'$, satisfying +% +\begin{align*} +&\sup_{w \in \cW} +\Big| +\hat f_W'(w) - \E[\hat f_W'(w)] +- Z_n^{f\prime}(w) +\Big| \\ +&\quad\lesssim_\P +n^{-1} \log n ++ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-7/6} h^{-1/2} (\log n)^{2/3}. +\end{align*} +% +Further, $Z_n^{f\prime}$ has the same covariance +structure as +$\hat f_W'(w)$ +in the sense that for all +$w, w' \in \cW$, +% +\begin{align*} +\E\big[Z_n^{f\prime}(w) Z_n^{f\prime}(w')\big] +&= +\Cov\Big[ +\hat f_W'(w), +\hat f_W'(w') +\Big] += \Sigma_n(w,w'). 
+\end{align*} +% +It has continuous trajectories satisfying the following regularity property +for any $\delta_n \in (0, 1/2]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\Big| +Z_n^{f\prime}(w) +- Z_n^{f\prime}(w') +\Big| +\right] +&\lesssim +\frac{\Du}{\sqrt n} \delta_n +\sqrt{\log \frac{1}{\delta_n}} ++ \frac{1}{\sqrt{n^2h}} +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h\delta_n}}. +\end{align*} +% +\end{theorem} + +The main result Theorem~\ref{thm:kernel_strong_approx_Tn} +now follows easily using Theorem~\ref{thm:kernel_app_strong_approx_fW}, +the bias bound from Theorem~\ref{thm:kernel_bias}, +and properties of $\Sigma_n$ established in +Lemma~\ref{lem:kernel_variance_bounds}. + +\subsection{Covariance estimation} +\label{sec:kernel_app_covariance_estimation} + +In this section we carefully construct a consistent estimator for the +covariance function $\Sigma_n$. Firstly, we characterize $\Sigma_n$ in +Lemma~\ref{lem:kernel_app_covariance_structure}. In +Lemma~\ref{lem:kernel_app_covariance_estimation} +we define the estimator and demonstrate that it converges in probability in a +suitable sense. In Lemma~\ref{lem:kernel_app_alternative_covariance_estimator} +we give an +alternative representation which is more amenable to computation. + +\begin{lemma}[Covariance structure] +\label{lem:kernel_app_covariance_structure} + +Suppose Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} +hold. Then $\Sigma_n$, as defined in Section~\ref{sec:kernel_degeneracy}, +admits the following representations, +where $1 \leq i < j < r \leq n$. +% +\begin{align*} +\Sigma_n(w,w') +&= +\frac{2}{n(n-1)} +\,\Cov\!\big[ +k_h(W_{i j},w), +k_h(W_{i j},w') +\big] ++ +\frac{4(n-2)}{n(n-1)} +\,\Cov\!\big[ +k_h(W_{i j},w), +k_h(W_{i r},w') +\big] \\ +&= +\frac{2}{n(n-1)} +\,\Cov\!\big[ +k_h(W_{i j},w), +k_h(W_{i j},w') +\big] \\ +&\quad+ +\frac{4(n-2)}{n(n-1)} +\,\Cov\!\big[ +\E[k_h(W_{i j},w) \mid A_i], +\E[k_h(W_{i j},w') \mid A_i] +\big], +\end{align*} +% +\end{lemma} + +\begin{lemma}[Covariance estimation] +\label{lem:kernel_app_covariance_estimation} + +Grant Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth}, +and suppose $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. Define +% +\begin{align*} +S_{i j r}(w,w') +&= +\frac{1}{6} +\Big( +k_h(W_{i j},w) +k_h(W_{i r},w') ++ k_h(W_{i j},w) +k_h(W_{jr},w') ++ k_h(W_{i r},w) +k_h(W_{i j},w') \\ +&\quad+ +k_h(W_{i r},w) +k_h(W_{jr},w') ++ k_h(W_{jr},w) +k_h(W_{i j},w') ++ k_h(W_{jr},w) +k_h(W_{i r},w') +\Big), \\ +\hat \Sigma_n(w,w') +&= +\frac{4}{n^2(n-1)^2} +\sum_{i 0$ on $\cW$. +Then the optimization problem \eqref{eq:kernel_app_sdp} +has an approximately optimal solution $\hat\Sigma_n^+$ +which is uniformly entrywise-consistent +for $\Sigma_n$ in the sense that +% +\begin{align*} +\sup_{w,w' \in \cW} +\left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} +\right| +&\lesssim_\P +\frac{\sqrt{\log n}}{n}. +\end{align*} + +\end{lemma} + +The optimization problem \eqref{eq:kernel_app_sdp} is stated for functions +rather than +matrices so is infinite-dimensional. However, when restricting to finite-size +matrices, Lemma~\ref{lem:kernel_app_sdp} still holds and does not depend on the +size +of the matrices. Furthermore, the problem then becomes a semi-definite program +and so can be solved to arbitrary precision in polynomial time in the size of +the matrices \citep{laurent2005semidefinite}. 
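For concreteness, a finite-dimensional program of this type
can be sketched as follows.
This version is for illustration only:
it evaluates the estimator on an arbitrary grid
$w_1, \ldots, w_m \in \cW$,
uses a matrix variable $M$ in place of the function $\hat \Sigma_n^+$,
and omits the Lipschitz-type constraint;
the precise formulation is given in \eqref{eq:kernel_app_sdp}.
%
\begin{align*}
\min_{M \in \R^{m \times m}}
\max_{1 \leq k, l \leq m}
\left|
\frac{M_{k l} - \hat \Sigma_n(w_k, w_l)}
{\sqrt{\hat \Sigma_n(w_k, w_k) + \hat \Sigma_n(w_l, w_l)}}
\right|
\quad \text{subject to} \quad
M \succeq 0.
\end{align*}
%
Introducing an auxiliary scalar variable for the objective value
yields a linear objective with entrywise linear constraints and a single
semi-definite constraint, so the program can be passed directly to
standard semi-definite programming solvers.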
+ +The Lipschitz-type constraint in the optimization problem +\eqref{eq:kernel_app_sdp} +ensures that $\hat \Sigma_n^+$ is sufficiently smooth and is a technicality +required by some of the later proofs. In practice this constraint is readily +verified. + +\begin{lemma}[Positive semi-definite variance estimator bounds] +\label{lem:kernel_app_variance_estimator_bounds} + +Suppose that Assumptions~\ref{ass:kernel_data} +and~\ref{ass:kernel_bandwidth} hold, and that +$n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. +Then $\hat \Sigma_n^+(w,w) \geq 0$ +almost surely for all $w \in \cW$ and +% +\begin{align*} +\frac{\Dl^2}{n} + \frac{1}{n^2h} +&\lesssim_\P +\inf_{w \in \cW} \hat \Sigma_n^+(w,w) +\leq +\sup_{w \in \cW} \hat \Sigma_n^+(w,w) +\lesssim_\P +\frac{\Du^2}{n} + \frac{1}{n^2h}. +\end{align*} + +\end{lemma} + +\subsection{Feasible uniform confidence bands} + +We use the strong approximation derived in +Section~\ref{sec:kernel_app_strong_approx} and the +positive semi-definite covariance estimator introduced in +Section~\ref{sec:kernel_app_covariance_estimation} to construct feasible +uniform +confidence bands. We drop the prime notation for copies of processes +in the interest of clarity. + +\begin{lemma}[Proximity of the standardized and studentized $t$-statistics] +\label{lem:kernel_app_studentized_t_statistic} + +Let Assumptions \ref{ass:kernel_data} and +\ref{ass:kernel_bandwidth} hold, and suppose that +$n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. +Define for $w \in \cW$ +the Studentized $t$-statistic process +% +\begin{align*} +\hat T_n(w) = \frac{\hat f_W(w) - f_W(w)} +{\sqrt{\hat\Sigma_n^+(w,w)}}. +\end{align*} +% +Then +% +\begin{align*} +\sup_{w \in \cW} +\left| \hat T_n(w) - T_n(w) \right| +&\lesssim_\P +\sqrt{\frac{\log n}{n}} +\left( +\sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} +{\Dl + 1/\sqrt{n h}} +\right) +\frac{1}{\Dl + 1/\sqrt{n h}}. +\end{align*} + +\end{lemma} + +\begin{lemma}[Feasible Gaussian approximation +of the infeasible Gaussian process] +\label{lem:kernel_app_distributional_approx_feasible_gaussian} + +Let Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} +hold, and suppose that +$n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. +Define a process $\hat Z_n^T(w)$ which, +conditional on the data $\bW_n$, +is conditionally mean-zero and +conditionally Gaussian, and whose +conditional covariance structure is +% +\begin{align*} +\E\left[ +\hat Z_n^T(w) \hat Z_n^T(w') +\bigm| \bW_n \right] +&= +\frac{\hat \Sigma_n^+(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\end{align*} +% +Then the following conditional +Kolmogorov--Smirnov result holds. +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| Z_n^T(w) \right| +\leq t +\right) +- \P\left( +\sup_{w \in \cW} +\left| \hat Z_n^T(w) \right| +\leq t +\biggm\vert \bW_n +\right) +\right| +&\lesssim_\P +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}}. +\end{align*} + +\end{lemma} + +\begin{lemma}[Feasible Gaussian approximation of the studentized $t$-statistic] +\label{lem:kernel_app_feasible_gaussian_approx} + +Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth} +and \ref{ass:kernel_rates} hold, and suppose that $f_W(w) > 0$ on $\cW$. +Then +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) +- \P\left( +\sup_{w \in \cW} +\left| \hat Z_n^T(w) \right| +\leq t +\Bigm\vert \bW_n +\right) +\right| +&\ll_\P +1. 
+\end{align*} + +\end{lemma} + +These intermediate lemmas can be used to establish the valid and feasible +uniform confidence bands presented in Theorem~\ref{thm:kernel_ucb} in the main +text. See Section~\ref{sec:kernel_app_proofs} for details. + +\subsection{Counterfactual dyadic density estimation} + +In this section we give a detailed analysis of the counterfactual +estimator of Section~\ref{sec:kernel_counterfactual}. +We begin with an assumption describing the counterfactual setup. + +\begin{assumption}[Counterfactual data generation] +\label{ass:kernel_app_counterfactual} + +For each $r \in \{0,1\}$, +let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be as in +Assumption~\ref{ass:kernel_data}. +Let $X_i^r$ be finitely-supported variables, +setting $\bX_n^r = (X_1^r, \ldots, X_n^r)$. +Suppose that $(A_i^r, X_i^r)$ are +independent over $1 \leq i \leq n$ +and that $\bX_n^r$ is independent of $\bV_n^r$. +Assume that $W_{i j}^r \mid X_i^r, X_j^r$ +has a Lebesgue density +$f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$ +and that $X_i^r$ has positive +probability mass function +$p_X^r(x)$ on a common support $\cX$. +Suppose that +$(\bA_n^0, \bV_n^0, \bX_n^0)$ +and $(\bA_n^1, \bV_n^1, \bX_n^1)$ +are independent. + +\end{assumption} + +The counterfactual density of $W_{i j}$ in population $1$ had $X_i, X_j$ +followed population $0$ is +% +\begin{align*} +f_W^{1 \triangleright 0}(w) +&= +\E\left[ +f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) +\right] += \sum_{x_1 \in \cX} +\sum_{x_2 \in \cX} +f_{W \mid XX}^{1}(w \mid x_1, x_2) +\psi(x_1) +\psi(x_2) +p_X^{1}(x_1) +p_X^{1}(x_2), +\end{align*} +% +with $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$. +Define the counterfactual dyadic kernel density estimator +% +\begin{align*} +\hat f_W^{1 \triangleright 0}(w) +&= +\frac{2}{n(n-1)} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^n +\hat \psi(X_i^1) +\hat \psi(X_j^1) +k_h(W_{i j}^1, w), +\end{align*} +% +where +$\hat\psi(x) = \I\{\hat p_X^{1}(x) > 0\}\hat p_X^{0}(x) / \hat p_X^{1}(x)$ +and $\hat p_X^{r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$. +Since $p_X^r(x) > 0$, +% +\begin{align*} +\hat\psi(x) - \psi(x) +&= +\frac{\hat p_X^{0}(x) - p_X^0(x)}{p_X^1(x)} +- \frac{p_X^0(x)}{p_X^1(x)} +\frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \\ +&\quad+ +\frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} +\frac{\hat p_X^{1}(x) p_X^0(x) - \hat p_X^{0}(x)p_X^1(x)} +{\hat p_X^{1}(x) p_X^1(x)} \\ +&= +\frac{1}{n} +\sum_{r=1}^n \kappa(X_r^0, X_r^1, x) ++ O_\P\left(\frac{1}{n}\right) +\end{align*} +% +is an asymptotic linear representation where +% +\begin{align*} +\kappa(X_i^0, X_i^1, x) +&= +\frac{\I\{X_i^0 = x\} - p_X^0(x)}{p_X^1(x)} +- \frac{p_X^0(x)}{p_X^1(x)} +\frac{\I\{X_i^1 = x\} - p_X^1(x)}{p_X^1(x)} +\end{align*} +% +satisfies +$\E[\kappa(X_i^0, X_i^1, x)] = 0$. +We now establish uniform consistency and feasible strong +approximation results for the counterfactual density estimator. + +\begin{lemma}[Bias of $\hat f_W^{1 \triangleright 0}$] +\label{lem:kernel_app_counterfactual_bias} + +Suppose that Assumptions~\ref{ass:kernel_data}, +\ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. +Then +% +\begin{align*} +\sup_{w \in \cW} +\big| +\E\big[\hat f_W^{1 \triangleright 0}(w)\big] +- f_W^{1 \triangleright 0}(w) +\big| +\lesssim +h^{p \wedge \beta} + \frac{1}{n}. 
+\end{align*} + +\end{lemma} + +\begin{lemma}[Hoeffding-type decomposition for +$\hat f_W^{1 \triangleright 0}$] +\label{lem:kernel_app_counterfactual_hoeffding} + +Suppose that Assumptions~\ref{ass:kernel_data}, +\ref{ass:kernel_bandwidth}, and +\ref{ass:kernel_app_counterfactual} hold. +With $k_{i j} = k_h(W_{i j}^1, w)$, +$\kappa_{r i} = \kappa(X_r^0, X_r^1, X_i^1)$ +and $\psi_i = \psi(X_i^1)$, define the projections +% +\begin{align*} +u +&= +\E\left[ +k_{i j} +\psi_i +\psi_j +\right], \\ +u_i +&= +\frac{2}{3} \psi_i +\E\left[ +k_{i j} +\psi_j +\mid A_i^1 \right] ++ +\frac{2}{3} \E\left[ +k_{jr} +\psi_j \kappa_{i r} +\mid X_i^0, X_i^1 \right] +- \frac{2}{3} u, \\ +u_{i j} +&= +\frac{1}{3} +\psi_i +\psi_j +\E\left[ +k_{i j} +\mid A_i^1, A_j^1 \right] ++ +\frac{1}{3} +\psi_i +\E\left[ +k_{i r} \psi_r +\mid A_i^1 \right] ++ +\frac{1}{3} +\psi_i +\E\left[ +k_{i r} \kappa_{jr} +\mid A_i^1, X_j^0, X_j^1 \right] \\ +&\quad+ +\frac{1}{3} +\kappa_{j i} +\E\left[ +k_{i r} \psi_r +\mid A_i^1 \right] ++ \frac{1}{3} +\psi_j +\E\left[ +k_{jr} \psi_r +\mid A_j^1 \right] ++ +\frac{1}{3} +\psi_j +\E\left[ +k_{jr} \kappa_{i r} +\mid X_i^0, X_i^1, A_j^1 \right] \\ +&\quad+ +\frac{1}{3} +\kappa_{i j} +\E\left[ +k_{jr} \psi_r +\mid A_j^1 \right] +- u_i - u_j + u, \\ +u_{i j r} +&= +\frac{1}{3} +\psi_i \psi_j +\E\left[ +k_{i j} +\mid A_i^1, A_j^1 \right] ++ +\frac{1}{3} +\psi_i \kappa_{r j} +\E\left[ +k_{i j} +\mid A_i^1, A_j^1 \right] ++ +\frac{1}{3} +\psi_j \kappa_{r i} +\E\left[ +k_{i j} +\mid A_i^1, A_j^1 \right] \\ +&\quad+ +\frac{1}{3} +\psi_i \psi_r +\E\left[ +k_{i r} +\mid A_i^1, A_r^1 \right] ++ \frac{1}{3} +\psi_i \kappa_{jr} +\E\left[ +k_{i r} +\mid A_i^1, A_r^1 \right] ++ +\frac{1}{3} +\psi_r \kappa_{j i} +\E\left[ +k_{i r} +\mid A_i^1, A_r^1 \right] \\ +&\quad+ +\frac{1}{3} +\psi_j \psi_r +\E\left[ +k_{jr} +\mid A_j^1, A_r^1 \right] ++ \frac{1}{3} +\psi_j \kappa_{i r} +\E\left[ +k_{jr} +\mid A_j^1, A_r^1 \right] ++ +\frac{1}{3} +\psi_r \kappa_{i j} +\E\left[ +k_{jr} +\mid A_j^1, A_r^1 \right] \\ +&\quad- +u_{i j} - u_{i r} - u_{jr} ++ u_i + u_j + u_r +- u, \\ +v_{i j r} +&= +\frac{1}{3} +k_{i j} \big(\psi_i \psi_j +\psi_i \kappa_{r j} +\psi_j \kappa_{r i} \big) ++ \frac{1}{3} +k_{i r} \big(\psi_i \psi_r +\psi_i \kappa_{jr} +\psi_r \kappa_{j i} \big) \\ +&\quad+ +\frac{1}{3} +k_{jr} \big(\psi_j \psi_r +\psi_j \kappa_{i r} +\psi_r \kappa_{i j} \big). +\end{align*} +% +With $l_i^{1 \triangleright 0}(w) = u_i$ +and $e_{i j r}^{1 \triangleright 0}(w) = v_{i j r} - u_{i j r}$, +set +% +\begin{align*} +L_n^{1 \triangleright 0}(w) +&= +\frac{3}{n} \sum_{i=1}^n +l_i^{1 \triangleright 0}(w) +&\text{and} & +&E_n^{1 \triangleright 0}(w) +&= +\frac{6}{n(n-1)(n-2)} +\sum_{i=1}^{n-2} +\sum_{j=i+1}^{n-1} +\sum_{r=i+1}^n +e_{i j r}^{1 \triangleright 0}(w). +\end{align*} +% +Then the following Hoeffding-type decomposition holds, +where $O_\P(1/n)$ is uniform in $w \in \cW$. +% +\begin{align*} +\hat f_W^{1 \triangleright 0}(w) += \E\big[\hat f_W^{1 \triangleright 0}(w)\big] ++ L_n^{1 \triangleright 0}(w) ++ E_n^{1 \triangleright 0}(w) ++ O_\P\left( \frac{1}{n} \right). +\end{align*} +% +Further, +the stochastic processes +$L_n^{1 \triangleright 0}$ +and $E_n^{1 \triangleright 0}$ +are mean-zero and orthogonal +in $L^2(\P)$. 
+Define the upper and lower degeneracy constants as +% +\begin{align*} +\Du^{1 \triangleright 0} +&= +\limsup_{n \to \infty} +\sup_{w \in \cW} +\Var\big[ +l_i^{1 \triangleright 0}(w) +\big]^{1/2} +&\text{and}& +& +\Dl^{1 \triangleright 0} +&= +\liminf_{n \to \infty} +\inf_{w \in \cW} +\Var\big[ +l_i^{1 \triangleright 0}(w) +\big]^{1/2}. +\end{align*} + +\end{lemma} + +\begin{lemma}[Uniform consistency of $\hat f_W^{1 \triangleright 0}$] +\label{lem:kernel_app_counterfactual_uniform_consistency} + +Suppose that Assumptions~\ref{ass:kernel_data}, +\ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. +Then +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big|\hat f_W^{1 \triangleright 0}(w) +- f_W^{1 \triangleright 0}(w) +\right] +&\lesssim +h^{p \wedge \beta} ++ \frac{\Du^{1 \triangleright 0}}{\sqrt n} ++ \sqrt{\frac{\log n}{n^2h}}. +\end{align*} + +\end{lemma} + +\begin{lemma}[Strong approximation of $\hat f_W^{1 \triangleright 0}$] +\label{lem:kernel_app_counterfactual_sa} + +On an appropriately enlarged probability space +and for any sequence $R_n \to \infty$, +there exists a mean-zero Gaussian process +$Z_n^{f, 1 \triangleright 0}$ +with the same covariance structure as +$\hat f_W^{1 \triangleright 0}(w)$ satisfying +% +\begin{align*} +&\sup_{w \in \cW} +\left| +\hat f_W^{1 \triangleright 0}(w) +- \E\big[\hat f_W^{1 \triangleright 0}(w)\big] +- Z_n^{f, 1 \triangleright 0}(w) +\right| \\ +&\quad\lesssim_\P +n^{-1} \log n ++ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-7/6} h^{-1/2} (\log n)^{2/3}. +\end{align*} + +\end{lemma} + +\begin{lemma}[Counterfactual covariance structure] +\label{lem:kernel_app_counterfactual_covariance_structure} + +Writing $k_{i j}'$ for $k_h(W_{i j}^1, w')$ etc., +the counterfactual covariance function is +% +\begin{align*} +&\Sigma_n^{1 \triangleright 0}(w,w') += \Cov\left[ +\hat f_W^{1 \triangleright 0}(w), +\hat f_W^{1 \triangleright 0}(w') +\right] \\ +&\quad= +\frac{4}{n} +\E\left[ +\Big( +\psi_i +\E\big[ +k_{i j} \psi_j +\mid A_i^1 +\big] ++ \E\left[ +k_{r j} \psi_r +\kappa_{i j} +\mid X_i^0, X_i^1 +\right] +\Big) +\right. \\ +&\left. +\qquad\qquad\quad +\times +\Big( +\psi_i +\E\big[ +k_{i j}' \psi_j +\mid A_i^1 +\big] ++ \E\left[ +k_{r j}' \psi_r \kappa_{i j} +\mid X_i^0, X_i^1 +\right] +\Big) +\right] \\ +&\qquad+ +\frac{2}{n^2} +\E\left[ +k_{i j} k_{i j}' +\psi_i^2 \psi_j^2 +\right] +- \frac{4}{n} +\E\left[ +k_{i j} \psi_i \psi_j +\right] +\E\left[ +k_{i j}' \psi_i \psi_j +\right] ++ O\left( \frac{1}{n^{3/2}} + \frac{1}{\sqrt{n^4h}} \right). +\end{align*} + +\end{lemma} + +\begin{lemma}[Gaussian approximation +of the standardized counterfactual $t$-statistic] +\label{lem:kernel_app_counterfactual_infeasible_t_statistic} + +Let Assumptions \ref{ass:kernel_data}, +\ref{ass:kernel_bandwidth}, and +\ref{ass:kernel_app_counterfactual} +hold, and suppose +$f_W^{1 \triangleright 0}(w) > 0$ on $\cW$. +Define +% +\begin{align*} +T_n^{1 \triangleright 0}(w) +&= \frac{\hat f_W^{1 \triangleright 0}(w) +- f_W^{1 \triangleright 0}(w)} +{\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}} +\quad\text{and}\quad +Z_n^{T, 1 \triangleright 0}(w) += \frac{Z_n^{f, 1 \triangleright 0}(w)} +{\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}. 
+\end{align*} +% +Then with $R_n \to \infty$ as in Lemma~\ref{lem:kernel_app_counterfactual_sa}, +% +\begin{align*} +&\sup_{w \in \cW} +\left| +T_n^{1 \triangleright 0}(w) - Z_n^{T, 1 \triangleright 0}(w) +\right| \\ +&\quad\lesssim_\P +\frac{ +n^{-1/2} \log n ++ n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-2/3} h^{-1/2} (\log n)^{2/3} ++ n^{1/2} h^{p \wedge \beta}} +{\Dl^{1 \triangleright 0} + 1/\sqrt{n h}}. +\end{align*} + +\end{lemma} + +\begin{theorem}[Infeasible counterfactual uniform confidence bands] +\label{thm:kernel_app_counterfactual_infeasible_ucb} + +Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, +\ref{ass:kernel_rates}, and \ref{ass:kernel_app_counterfactual} +hold and suppose that $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$. +Let $\alpha \in (0,1)$ be a confidence level +and define $q^{1 \triangleright 0}_{1-\alpha}$ as the quantile +satisfying +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\left| Z_n^{T,1 \triangleright 0}(w) \right| +\leq q^{1 \triangleright 0}_{1-\alpha} +\right) +&= +1 - \alpha. +\end{align*} +% +Then +% +\begin{align*} +\P\left( +f_W^{1 \triangleright 0}(w) +\in +\left[ +\hat f_W^{1 \triangleright 0}(w) +\pm +q^{1 \triangleright 0}_{1-\alpha} +\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)} +\, \right] +\, \textup{for all } +w \in \cW +\right) +\to 1 - \alpha. +\end{align*} +\end{theorem} +% +We propose an estimator for the counterfactual covariance function +$\Sigma_n^{1 \triangleright 0}$. First let +% +\begin{align*} +\hat\kappa(X_i^0, X_i^1, x) +&= +\frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} +- \frac{\hat p_X^0(x)}{\hat p_X^1(x)} +\frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)}, +\end{align*} +% +and define the leave-out conditional expectation estimators +% +\begin{align*} +S_i^{1 \triangleright 0}(w) +&= +\hat\E\left[ +k_h(W_{i j}^1,w) \psi(X_j^1) \mid A_i^1 +\right] \\ +&= +\frac{1}{n-1} +\left( +\sum_{j=1}^{i-1} +k_h(W_{j i}^1,w) \hat\psi(X_j^1) ++ \sum_{j=i+1}^n +k_h(W_{i j}^1,w) \hat\psi(X_j^1) +\right), \\ +\tilde S_i^{1 \triangleright 0}(w) +&= +\hat\E\left[ +k_h(W_{r j}^1,w) \psi(X_r^1) +\kappa(X_i^0, X_i^1, X_j^1) \mid X_i^0, X_i^1 +\right] \\ +&= +\frac{1}{n-1} +\sum_{j=1}^n +\I\{j \neq i\} +\hat\kappa(X_i^0, X_i^1, X_j^1) +S_j^{1 \triangleright 0}(w). +\end{align*} +% +Then set +% +\begin{align*} +\hat\Sigma_n^{1 \triangleright 0}(w,w') +&= +\frac{4}{n^2} +\sum_{i=1}^n +\left( +\hat\psi(X_i^1) +S_i^{1 \triangleright 0}(w) ++ \tilde S_i^{1 \triangleright 0}(w) +\right) +\left( +\hat\psi(X_i^1) +S_i^{1 \triangleright 0}(w') ++ \tilde S_i^{1 \triangleright 0}(w') +\right) \\ +&\quad- +\frac{4}{n^3(n-1)} +\sum_{i 0$ satisfy +$\sup_{f \in \cF} +\|f\|_{\bar\P,2} +\leq +\sigma +\leq +\|F\|_{\bar\P,2}$ +and +$M = \max_{1 \leq i \leq n} F(X_i)$. +Then with +$\delta = \sigma / \|F\|_{\bar\P,2} \in (0,1]$, +% +\begin{align*} +\E \left[ +\sup_{f \in \cF} +\big| G_n(f) \big| +\right] +&\lesssim +\|F\|_{\bar\P,2} +\, J\big(\delta, \cF, F \big) ++ +\frac{\|M\|_{\P,2} \, J(\delta, \cF, F)^2}{\delta^2 \sqrt{n}}, +\end{align*} +% +where $\lesssim$ is up to a universal constant, +and $J(\delta, \cF, F)$ is the covering integral +% +\begin{align*} +J\big(\delta, \cF, F\big) +&= +\int_0^\delta +\sqrt{1 + +\sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} +\diff{\varepsilon}, +\end{align*} +% +with the supremum taken over finite discrete probability +measures $\Q$ on $(S, \cS)$. 
+ +\end{lemma} + +\begin{lemma}[A VC class maximal inequality for i.n.i.d.\ empirical processes] +\label{lem:kernel_app_maximal_vc_inid} + +Assume the same setup as in +Lemma~\ref{lem:kernel_app_maximal_entropy}, +and suppose that $\cF$ forms a VC-type class +in that +% +\begin{align*} +\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) +&\leq +(C_1/\varepsilon)^{C_2} +\end{align*} +% +for all $\varepsilon \in (0,1]$, +for some constants +$C_1 \geq e$ +(where $e$ is the standard exponential constant) +and $C_2 \geq 1$. +Then for $\delta \in (0,1]$ +we have the covering integral bound +% +$J\big(\delta, \cF, F\big) \leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}$, +% +and so by Lemma~\ref{lem:kernel_app_maximal_entropy}, +up to a universal constant, +% +\begin{align*} +\E \left[ +\sup_{f \in \cF} +\big| G_n(f) \big| +\right] +&\lesssim +\sigma +\sqrt{C_2 \log (C_1/\delta)} ++ +\frac{\|M\|_{\P,2} C_2 \log(C_1/\delta)}{\sqrt{n}} \\ +&\lesssim +\sigma +\sqrt{C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} ++ +\frac{\|M\|_{\P,2} C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} +{\sqrt{n}}. +\end{align*} +% +\end{lemma} + +\subsection{Strong approximation results} + +Next we provide two strong approximation results. +The first is a corollary of the KMT approximation +\citep{komlos1975approximation} +which applies to bounded-variation functions +of i.i.d.\ variables. +The second is an extension of the Yurinskii coupling +\citep{belloni2019conditional} +which applies to Lipschitz functions +of i.n.i.d.\ variables. + +\begin{lemma}[A KMT approximation corollary] +\label{lem:kernel_app_kmt_corollary} + +For $n \geq 1$ +let $X_1, \ldots, X_n$ +be i.i.d.\ real-valued random variables and +$g_n: \R \times \R \to \R$ +be a function satisfying +the total variation bound +$\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV < \infty$. +Then on some probability space +there exist independent copies of +$X_1, \ldots, X_n$, +denoted +$X_1', \ldots, X_n'$, +and a mean-zero Gaussian process $Z_n(x)$ +such that if we define +the empirical process +% +\begin{align*} +G_n(x) += \frac{1}{\sqrt n} \sum_{i=1}^n +\Big(g_n(X_i',x) - \E\big[g_n(X_i',x)\big]\Big), +\end{align*} +% +then +for some universal positive constants +$C_1$, $C_2$, and $C_3$, +% +\begin{align*} +\P\left( +\sup_{x \in \R} +\big|G_n(x) - Z_n(x)\big| +> \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV +\, \frac{t + C_1 \log n}{\sqrt n} +\right) +\leq C_2 e^{-C_3 t}. +\end{align*} +% +Further, $Z_n$ +has the same covariance structure as $G_n$ +in the sense that for all $x,\, x' \in \R$, +% +\begin{align*} +\E\big[Z_n(x) Z_n(x')\big] += \E\big[G_n(x) G_n(x')\big]. +\end{align*} +% +By independently sampling from the law of +$Z_n$ conditional on $X_1', \ldots, X_n'$, +we can assume that +$Z_n$ is a function only of $X_1', \ldots, X_n'$ +and some independent random noise. + +\end{lemma} + +\begin{lemma}[Yurinskii coupling for Lipschitz i.n.i.d.\ empirical processes] +\label{lem:kernel_app_yurinskii_corollary} + +For $n \geq 2$ let $X_1, \dots, X_n$ +be independent but not necessarily identically distributed +(i.n.i.d.) random variables +taking values in a measurable space $(S, \cS)$ +and let $\cX_n \subseteq \R$ +be a compact interval +with $\left|\log \Leb(\cX_n)\right| \leq C_1 \log n$ +where $C_1 > 0$ is a constant. 
+Let $g_n$ be measurable on $S \times \cX_n$ satisfying +$\sup_{\xi \in S} \sup_{x \in \cX_n} |g_n(\xi, x)| \leq M_n$ +and +$\sup_{x \in \cX_n} \max_{1 \leq i \leq n} \Var[g_n(X_i, x)] +\leq \sigma_n^2$, +with $\left|\log M_n\right| \leq C_1 \log n$ +and $\left|\log \sigma_n^2\right| \leq C_1 \log n$. +Suppose that $g_n$ satisfies the following uniform +Lipschitz condition: +% +\begin{align*} +\sup_{\xi \in S} +\sup_{x,x' \in \cX_n} +\left| +\frac{g_n(\xi, x) - g_n(\xi, x')} +{x-x'} +\right| +\leq +l_{n,\infty}, +\end{align*} +% +and also the following $L^2$ +Lipschitz condition: +% +\begin{align*} +\sup_{x,x' \in \cX_n} +\E\left[ +\frac{1}{n} +\sum_{i=1}^n +\left| +\frac{g_n(X_i, x) - g_n(X_i, x')} +{x-x'} +\right|^2 +\right]^{1/2} +\leq +l_{n,2}, +\end{align*} +% +where $0 < l_{n,2} \leq l_{n,\infty}$, +$\left|\log l_{n,2}\right| \leq C_1 \log n$, and +$\left|\log l_{n,\infty}\right| \leq C_1 \log n$. +Then for any $t_n > 0$ with +$\left|\log t_n\right| \leq C_1 \log n$, +there is a probability space carrying +independent copies of $X_1, \ldots, X_n$ denoted $X_1', \ldots, X_n'$ +and a mean-zero Gaussian process $Z_n(x)$ +such that if we define the empirical process +% +$G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n +\big( g_n(X'_i,x) - \E[g_n(X'_i,x)] \big)$, +% +then +% +\begin{align*} +&\P\left( +\sup_{x \in \cX_n} +\big| +G_n(x) - Z_n(x) +\big| +> t_n +\right) \\ +&\quad\leq +\frac{ +C_2 +\sigma_n +\sqrt{\Leb(\cX_n)} +\sqrt{\log n} +\sqrt{M_n + \sigma_n\sqrt{\log n}} +}{n^{1/4} t_n^2} +\sqrt{ +l_{n,2} +\sqrt{\log n} ++ \frac{l_{n,\infty}}{\sqrt n} +\log n} +\end{align*} +% +where $C_2 > 0$ is a constant depending only on $C_1$. +Further, $Z_n$ +has the same covariance structure as $G_n$ +in the sense that for all $x, x' \in \cX_n$, +% +\begin{align*} +\E\big[Z_n(x) Z_n(x')\big] += \E\big[G_n(x) G_n(x')\big]. +\end{align*} + +\end{lemma} + +\subsection{The Vorob'ev--Berkes--Philipp theorem} + +We present a generalization of the Vorob'ev--Berkes--Philipp theorem +\citep{dudley1999uniform} +which allows one to ``glue'' multiple random variables +or stochastic processes onto the same probability space, +while preserving some pairwise distributions. +We begin with some definitions. + +\begin{definition}[Tree] +A \emph{tree} is a finite undirected graph which is connected and contains no +cycles or self-loops. +\end{definition} + +\begin{definition}[Polish Borel probability space] +A \emph{Polish Borel probability space} +is a triple $(\cX, \cF, \P)$, +where $\cX$ is a Polish space +(a topological space metrizable by a complete separable metric), +$\cF$ is the Borel $\sigma$-algebra induced on $\cX$ by its topology, +and $\P$ is a probability measure on $(\cX, \cF)$. +Important examples of Polish spaces include $\R^d$ and +the Skorokhod space $\cD[0,1]^d$ for $d \geq 1$. +In particular, +one can consider vectors of real-valued random variables +or stochastic processes indexed by +compact subsets of $\R^d$ which have +almost surely continuous trajectories. +\end{definition} + +\begin{definition}[Projection of a law] +Let $(\cX_1, \cF_1)$ and $(\cX_2, \cF_2)$ +be measurable spaces, and +let $\P_{12}$ be a law on the +product space +$(\cX_1 \times \cX_2, \cF_1 \otimes \cF_2)$. +The \emph{projection} of $\P_{12}$ +onto $\cX_1$ is the law +$\P_1$ defined on $(\cX_1, \cF_1)$ +by $\P_1 = \P_{12} \circ \pi_1^{-1}$ +where $\pi_1(x_1, x_2) = x_1$ +is the first-coordinate projection. 
+\end{definition} + +\begin{lemma}[Vorob'ev--Berkes--Philipp theorem, tree form] +\label{lem:kernel_app_vbp} + +Let $\cT$ be a tree with vertex set $\cV = \{1, \ldots, n\}$ +and edge set $\cE$. +Suppose that attached to each vertex $i$ is a +Polish Borel probability space +$(\cX_i, \cF_i, \P_i)$. +Suppose that attached to each edge $(i,j) \in \cE$ +(where $i t)\leq r_i$ +for each $1 \leq i \leq n-1$, +where $\|\cdot\|$ is a norm on $\cD[0,1]$. +Then there exist copies of +$X_1, \ldots, X_n$ +denoted +$X_1'', \ldots, X_n''$ +satisfying +$\P\big(\|X_{i+1}'' - X_i''\| > t)\leq r_i$ +for each $1 \leq i \leq n$. +That is, all of the inequalities +can be satisfied simultaneously +on the same probability space. + +\end{enumerate} + +\end{remark} + +\section{Proofs} +\label{sec:kernel_app_proofs} + +We present full proofs of all the results stated in +Chapter~\ref{ch:kernel} and Appendix~\ref{app:kernel}. + +\subsection{Preliminary lemmas} + +In this section we list some results +in probability and U-statistic theory +which are used in proofs of our main results. +Other auxiliary lemmas will be introduced when +they are needed. + +\begin{lemma}[Bernstein's inequality for independent random variables] +\label{lem:kernel_app_bernstein} + +Let $X_1, \ldots, X_n$ be independent real-valued +random variables with +$\E[X_i] = 0$, $|X_i| \leq M$, and +$\E[X_i^2] \leq \sigma^2$, +where $M$ and $\sigma$ are non-random. +Then for all $t>0$, +% +\begin{align*} +\P \left( +\left| \frac{1}{n} \sum_{i=1}^n X_i \right| \geq t +\right) +\leq 2 \exp \left( - +\frac{t^2 n} +{2 \sigma^2 + \frac{2}{3} M t} +\right). +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_bernstein}] + +See for example +Lemma~2.2.9 in~\citet{van1996weak}. +\end{proof} + +\begin{lemma}[The matrix Bernstein inequality] +\label{lem:kernel_app_matrix_bernstein} + +For $1 \leq i \leq n$ +let $X_i$ be independent symmetric $d \times d$ +real random matrices +with expected values $\mu_i = \E[X_i]$. +Suppose that +$\|X_i - \mu_i\|_2 \leq M$ +almost surely for all $1 \leq i \leq n$ +where $M$ is non-random, and define +$\sigma^2 = \big\| \sum_i \E[(X_i - \mu_i)^2] \big\|_2$. +Then there exists a universal constant $C > 0$ +such that +for any $t > 0$ and $q \geq 1$, +% +\begin{align*} +\P\left( +\left\| +\sum_{i=1}^n +\left( +X_i - \mu_i +\right) +\right\|_2 +\geq +2 \sigma \sqrt{t} ++ \frac{4}{3} M t +\right) +&\leq +2 d e^{-t}, \\ +\E\left[ +\left\| +\sum_{i=1}^n +\left( +X_i - \mu_i +\right) +\right\|_2^q +\right]^{1/q} +&\leq +C \sigma \sqrt{q + \log 2d} ++ C M (q + \log 2d). +\end{align*} +% +Another simplified version of this is as follows: +suppose that +$\|X_i\|_2 \leq M$ almost surely, +so that +$\|X_i - \mu_i\|_2 \leq 2M$. +Then since +$\sigma^2 \leq n M^2$, +we have +% +\begin{align*} +\P\left( +\left\| +\sum_{i=1}^n +\left( +X_i - \mu_i +\right) +\right\|_2 +\geq +4M \big(t + \sqrt{n t}\big) +\right) +&\leq +2 d e^{-t}, \\ +\E\left[ +\left\| +\sum_{i=1}^n +\left( +X_i - \mu_i +\right) +\right\|_2^q +\right]^{1/q} +&\leq +C M +\big(q + \log 2d + \sqrt{n(q + \log 2d)}\big). +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_matrix_bernstein}] + +See Lemma~3.2 in \citet{minsker2019moment}. +\end{proof} + +\begin{lemma}[A maximal inequality for Gaussian vectors] +\label{lem:kernel_app_gaussian_vector_maximal} + +Take $n \geq 2$. +Let $X_i \sim \cN(0, \sigma_i^2)$ +for $1 \leq i \leq n$ +with $\sigma_i^2 \leq \sigma^2$. 
+Then +% +\begin{align} +\label{eq:kernel_app_gaussian_vector_maximal} +\E\left[ +\max_{1 \leq i \leq n} +X_i +\right] +&\leq +\sigma \sqrt{2 \log n}, \\ +\label{eq:kernel_app_gaussian_vector_maximal_abs} +\E\left[ +\max_{1 \leq i \leq n} +|X_i| +\right] +&\leq +2 \sigma \sqrt{\log n}. +\end{align} +% +If $\Sigma_1$ and $\Sigma_2$ are constant +positive semi-definite $n \times n$ matrices +and $N \sim \cN(0,I_n)$, +then +% +\begin{align} +\label{eq:kernel_app_gaussian_difference_psd} +\E\Big[ +\big\| +\Sigma_1^{1/2} N +- \Sigma_2^{1/2} N +\big\|_\infty +\Big] +&\leq +2 \sqrt{\log n} \, +\big\| +\Sigma_1 - \Sigma_2 +\big\|_2^{1/2}. +\end{align} +% +If further $\Sigma_1$ is +positive definite, +then +% +\begin{align} +\label{eq:kernel_app_gaussian_difference_pd} +\E\Big[ +\big\| +\Sigma_1^{1/2} N +- \Sigma_2^{1/2} N +\big\|_\infty +\Big] +&\leq +\sqrt{\log n} \, +\lambda_{\min}(\Sigma_1)^{-1/2} \, +\big\| +\Sigma_1 - \Sigma_2 +\big\|_2. +\end{align} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_vector_maximal}] + +For $t > 0$, +Jensen's inequality on the concave logarithm function +gives +% +\begin{align*} +\E\left[ +\max_{1 \leq i \leq n} +X_i +\right] +&= +\frac{1}{t} +\E\left[ +\log +\exp +\max_{1 \leq i \leq n} +t X_i +\right] +\leq +\frac{1}{t} +\log +\E\left[ +\exp +\max_{1 \leq i \leq n} +t X_i +\right] +\leq +\frac{1}{t} +\log +\sum_{i=1}^n +\E\left[ +\exp +t X_i +\right] \\ +&= +\frac{1}{t} +\log +\sum_{i=1}^n +\exp +\left( +\frac{t^2 \sigma_i^2}{2} +\right) +\leq +\frac{1}{t} +\log n ++ \frac{t \sigma^2}{2}, +\end{align*} +% +by the Gaussian moment generating function. +Minimizing with $t = \sqrt{2 \log n} / \sigma$ +yields \eqref{eq:kernel_app_gaussian_vector_maximal}. +For \eqref{eq:kernel_app_gaussian_vector_maximal_abs}, +we use the symmetry of the Gaussian distribution: +% +\begin{align*} +\E\left[ +\max_{1 \leq i \leq n} +|X_i| +\right] +&= +\E\left[ +\max_{1 \leq i \leq n} +\{X_i, -X_i\} +\right] +\leq +\sigma \sqrt{2 \log 2n} +\leq +2 \sigma \sqrt{\log n}. +\end{align*} +% +For \eqref{eq:kernel_app_gaussian_difference_psd} +and \eqref{eq:kernel_app_gaussian_difference_pd}, +note that +$\Sigma_1^{1/2} N - \Sigma_2^{1/2} N$ +is Gaussian with covariance matrix +$\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. +The variances of of its components are the diagonal +elements of this matrix, namely +% +\begin{align*} +\sigma_i^2 +&= +\Var\big[ +\big(\Sigma_1^{1/2} N - \Sigma_2^{1/2} N\big)_i +\big] += +\Big(\big( +\Sigma_1^{1/2} - \Sigma_2^{1/2} +\big)^2\Big)_{ii}. +\end{align*} +% +Note that if $e_i$ is the +$i$th standard unit basis vector, +then for any real symmetric matrix $A$, +we have +$e_i^\T A^2 e_i = (A^2)_{ii}$, +so in particular +$(A^2)_{ii} \leq \|A\|_2^2$. +Therefore +% +\begin{align*} +\sigma_i^2 +&\leq +\big\| +\Sigma_1^{1/2} - \Sigma_2^{1/2} +\big\|_2^2 +=\vcentcolon +\sigma^2. +\end{align*} +% +Applying +\eqref{eq:kernel_app_gaussian_vector_maximal_abs} +then gives +% +\begin{align*} +\E\Big[ +\big\| +\Sigma_1^{1/2} N +- \Sigma_2^{1/2} N +\big\|_\infty +\Big] +&\leq +2 \sqrt{\log n} \, +\big\| +\Sigma_1^{1/2} - \Sigma_2^{1/2} +\big\|_2. +\end{align*} +% +By Theorem~X.1.1 +in \citet{bhatia1997matrix}, +we can deduce +% +\begin{align*} +\big\| +\Sigma_1^{1/2} - \Sigma_2^{1/2} +\big\|_2 +&\leq +\big\| +\Sigma_1 - \Sigma_2 +\big\|_2^{1/2}, +\end{align*} +% +giving +\eqref{eq:kernel_app_gaussian_difference_psd}. 
+If $\Sigma_1$ +is positive definite, +Theorem~X.3.8 in +\citet{bhatia1997matrix} gives +\eqref{eq:kernel_app_gaussian_difference_pd}: +% +\begin{align*} +\big\| +\Sigma_1^{1/2} - \Sigma_2^{1/2} +\big\|_2 +&\leq +\frac{1}{2} +\lambda_{\min}(\Sigma_1)^{-1/2} \, +\big\| +\Sigma_1 - \Sigma_2 +\big\|_2. +\end{align*} +% +\end{proof} + +\begin{lemma}[Maximal inequalities for Gaussian processes] +\label{lem:kernel_app_gaussian_process_maximal} + +Let $Z$ be a separable +mean-zero Gaussian process indexed +by $x \in \cX$. +Recall that $Z$ is separable for example if +$\cX$ is Polish and $Z$ has +continuous trajectories. +Define its covariance structure on $\cX \times \cX$ +by $\Sigma(x, x') = \E[Z(x) Z(x')]$, +and the corresponding semimetric on $\cX$ by +% +\begin{align*} +\rho(x,x') +&= +\E\big[\big(Z(x) - Z(x')\big)^2\big]^{1/2} += \big(\Sigma(x,x) +- 2 \Sigma(x,x') ++ \Sigma(x',x')\big)^{1/2}. +\end{align*} +% +Let $N(\varepsilon, \cX, \rho)$ +denote the $\varepsilon$-covering number of $\cX$ +with respect to the semimetric $\rho$. +Define $\sigma = \sup_x \Sigma(x,x)^{1/2}$. +Then there exists a universal constant $C > 0$ +such that for any $\delta > 0$, +% +\begin{align*} +\E\left[ +\sup_{x \in \cX} +|Z(x)| +\right] +&\leq +C \sigma ++ C \int_0^{2\sigma} +\sqrt{\log N(\varepsilon, \cX, \rho)} +\diff{\varepsilon}, \\ +\E\left[ +\sup_{\rho(x,x') \leq \delta} +|Z(x) - Z(x')| +\right] +&\leq +C \int_0^{\delta} +\sqrt{\log N(\varepsilon, \cX, \rho)} +\diff{\varepsilon}. +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_process_maximal}] + +See Corollary~2.2.8 in \citet{van1996weak}, +noting that for any $x,x' \in \cX$, we have +$\E[|Z(x)|] \lesssim \sigma$ and +$\rho(x,x') \leq 2\sigma$, +implying that +$\log N(\varepsilon, \cX, \rho) = 0$ +for all +$\varepsilon > 2 \sigma$. +\end{proof} + +\begin{lemma}[Anti-concentration for Gaussian process absolute suprema] +\label{lem:kernel_app_anticoncentration} + +Let $Z$ be a separable mean-zero Gaussian process +indexed by a semimetric space $\cX$ with +$\E[Z(x)^2] = 1$ +for all $x \in \cX$. +Then for any $\varepsilon > 0$, +% +\begin{align*} +\sup_{t \in \R} +\P\left( +\left| +\sup_{x \in \cX} +\big| Z(x) \big| +- t +\right| +\leq \varepsilon +\right) +&\leq +4 \varepsilon +\left( +1 + \E\left[ +\sup_{x \in \cX} +\big| Z(x) \big| +\right] +\right). +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_anticoncentration}] + +See Corollary~2.1 +in \citet{chernozhukov2014anti}. +\end{proof} + +\begin{lemma}[No slowest rate of convergence in probability] +\label{lem:kernel_app_slow_convergence} + +Let $X_n$ be a sequence of real-valued random +variables with +$X_n = o_\P(1)$. +Then there exists a deterministic sequence +$\varepsilon_n \to 0$ +such that +$\P\big(|X_n| > \varepsilon_n\big) \leq \varepsilon_n$ +for all $n \geq 1$. + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_slow_convergence}] + +Define the following deterministic sequence +for $k \geq 1$. +% +\begin{align*} +\tau_k +&= +\sup +\big\{ +n \geq 1: +\P\big(|X_n| > 1/k\big) +> 1/k +\big\} +\vee +(\tau_{k-1} +1) +\end{align*} +% +with $\tau_0 = 0$. +Since $X_n = o_\P(1)$, +each $\tau_k$ is finite +and so we can define +$\varepsilon_n = \frac{1}{k}$ +where $\tau_k < n \leq \tau_{k+1}$. +Then, noting that $\varepsilon_n \to 0$, +we have +$\P\big(|X_n| > \varepsilon_n\big) += \P\big(|X_n| > 1/k\big) \leq 1/k = \varepsilon_n$. 
+\end{proof}
+
+\begin{lemma}[General second-order Hoeffding-type decomposition]
+\label{lem:kernel_app_general_hoeffding}
+
+Let $\cU$ be a vector space.
+Let $u_{i j} \in \cU$ be defined for
+$1 \leq i, j \leq n$
+and
+$i \neq j$.
+Suppose that $u_{i j} = u_{j i}$
+for all $i,j$.
+Then for any $u_i \in \cU$
+(for $1 \leq i \leq n$)
+and any $u \in \cU$,
+the following decomposition holds:
+%
+\begin{align*}
+\sum_{i=1}^n
+\sum_{\substack{j=1 \\ j \neq i}}^n
+\big(u_{i j} - u\big)
+&=
+2(n-1)
+\sum_{i=1}^n
+\big(u_i - u\big)
++
+\sum_{i=1}^n
+\sum_{\substack{j=1 \\ j \neq i}}^n
+\big(u_{i j} - u_i - u_j + u\big).
+\end{align*}
+
+\end{lemma}
+
+\begin{proof}[Lemma~\ref{lem:kernel_app_general_hoeffding}]
+
+We compute the left hand side minus the right hand side,
+beginning by observing that all of the
+$u_{i j}$ and $u$ terms clearly cancel.
+%
+\begin{align*}
+&\sum_{i=1}^n
+\sum_{j \neq i}^n
+\big(u_{i j} - u\big)
+- 2(n-1)
+\sum_{i=1}^n
+\big(u_i - u\big)
+-
+\sum_{i=1}^n
+\sum_{j \neq i}
+\big(u_{i j} - u_i - u_j + u\big) \\
+&\qquad=
+- 2(n-1)
+\sum_{i=1}^n
+u_i
+-
+\sum_{i=1}^n
+\sum_{j \neq i}^n
+\big(- u_i - u_j\big)
+=
+- 2(n-1)
+\sum_{i=1}^n
+u_i
++
+\sum_{i=1}^n
+\sum_{j \neq i}^n
+u_i
++
+\sum_{j=1}^n
+\sum_{i \neq j}^n
+u_j \\
+&\qquad=
+- 2(n-1)
+\sum_{i=1}^n
+u_i
++
+(n-1)
+\sum_{i=1}^n
+u_i
++
+(n-1)
+\sum_{j=1}^n
+u_j
+= 0.
+\end{align*}
+\end{proof}
+
+\begin{lemma}[A U-statistic concentration inequality]
+\label{lem:kernel_app_ustat_concentration}
+
+Let $(S,\cS)$ be a measurable space and
+$X_1, \ldots, X_n$ be i.i.d.\ $S$-valued random variables.
+Let $H: S^m \to \R$ be a function of $m$ variables
+satisfying the symmetry property
+$H(x_1, \ldots, x_m) = H(x_{\tau (1)}, \ldots, x_{\tau (m)})$
+for any $m$-permutation $\tau$.
+Suppose also that
+$\E[H(X_1, \ldots, X_m)] = 0$.
+Let
+$M = \|H\|_\infty$
+and
+$\sigma^2 = \E\big[\E[H(X_1, \ldots, X_m) \mid X_1]^2\big]$.
+Define the U-statistic
+%
+\begin{align*}
+U_n
+&=
+\frac{m!(n-m)!}{n!}
+\sum_{1 \leq i_1 < \cdots < i_m \leq n}
+H(X_{i_1}, \ldots, X_{i_m}).
+\end{align*}
+%
+Then for any $t > 0$,
+with $C_1(m)$, $C_2(m)$
+positive constants depending only on $m$,
+%
+\begin{align*}
+\P\left(
+|U_n| > t
+\right)
+&\leq
+4 \exp \left(
+- \frac{n t^2}{C_1(m) \sigma^2 + C_2(m) M t}
+\right).
+\end{align*}
+%
+\end{lemma}
+
+\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_concentration}]
+See Theorem~2 in \citet{arcones1995bernstein}.
+\end{proof}
+
+\begin{lemma}[A second-order U-process maximal inequality]
+\label{lem:kernel_app_uprocess_maximal}
+
+Let $X_1, \ldots, X_n$
+be i.i.d.\ random variables taking values
+in a measurable space $(S, \cS)$
+with distribution $\P$.
+Let $\cF$ be a class of measurable functions from
+$S \times S$ to $\R$ which is also pointwise measurable.
+Define the degenerate second-order U-process
+%
+\begin{align*}
+U_n(f)
+=
+\frac{2}{n(n-1)}
+\sum_{1 \leq i < j \leq n}
+f(X_i, X_j),
+\end{align*}
+%
+where degeneracy means that
+$\E[f(X_1, x_2)] = 0$
+for all $x_2 \in S$ and all $f \in \cF$.
+Let $F$ be a measurable envelope for $\cF$,
+and suppose that $\cF$ satisfies the VC-type condition
+$\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})
+\leq (C_1/\varepsilon)^{C_2}$
+for all $\varepsilon \in (0,1]$,
+with constants $C_1 \geq e$ and $C_2 \geq 1$.
+Let $\sigma > 0$ be any deterministic value satisfying
+$\sup_{f \in \cF} \|f\|_{\P,2} \leq \sigma \leq \|F\|_{\P,2}$,
+and define the random variable $M = \max_{i,j} |F(X_i, X_j)|$.
+Then there exists a universal constant $C_3 > 0$
+satisfying
+%
+\begin{align*}
+n
+\E\left[
+\sup_{f \in \cF}
+\big| U_n(f) \big|
+\right]
+&\leq
+C_3 \sigma
+\Big(
+C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big)
+\Big)
++ \frac{C_3 \|M\|_{\P,2}}{\sqrt{n}}
+\Big(
+C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big)
+\Big)^2.
+\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_uprocess_maximal}] + +Apply Corollary~5.3 +from \citet{chen2020jackknife} +with the order of the U-statistic fixed at +$r=2$, +and with $k=2$. +\end{proof} + +\begin{lemma}[A U-statistic matrix concentration inequality] +\label{lem:kernel_app_ustat_matrix_concentration} + +Let $X_1, \ldots, X_n$ be i.i.d.\ random variables +taking values in a measurable space $(S, \cS)$. +Suppose +$H: S^2 \to \R^{d \times d}$ +is a measurable matrix-valued function +of two variables +satisfying the following: +% +\begin{enumerate}[label=(\roman*)] + +\item +$H(X_1, X_2)$ is an almost surely symmetric matrix. + +\item +$\|H(X_1, X_2)\|_2 \leq M$ almost surely. + +\item +$H$ is a symmetric function in its arguments in that +$H(X_1, X_2) = H(X_2, X_1)$. + +\item +$H$ is degenerate in the sense that +$\E[H(X_1, x_2)] = 0$ for all $x_2 \in S$. + +\end{enumerate} +% +Let $U_n = \sum_i \sum_{j \neq i} H(X_i, X_j)$ +be a U-statistic, +and define the variance-type constant +% +\begin{align*} +\sigma^2 +&= +\E\left[ +\left\| +\E\left[ +H(X_i, X_j)^2 +\mid X_j +\right] +\right\|_2 +\right]. +\end{align*} +% +Then for a universal constant $C > 0$ +and for all $t > 0$, +% +\begin{align*} +\P\left( +\|U_n\|_2 +\geq +C \sigma n (t + \log d) ++ C M \sqrt{n} (t + \log d)^{3/2} +\right) +&\leq +C e^{-t}. +\end{align*} +% +By Jensen's inequality, +$\sigma^2 \leq \E[ \| H(X_i, X_j)^2 \|_2 ] += \E[ \| H(X_i, X_j) \|_2^2 ] \leq M^2$, giving the simpler +% +\begin{align*} +\P\left( +\|U_n\|_2 +\geq +2 C M n +(t + \log d)^{3/2} +\right) +&\leq +C e^{-t}. +\end{align*} +% +From this last inequality we deduce a moment bound +by integration of tail probabilities: +% +\begin{align*} +\E\left[ +\|U_n\|_2 +\right] +&\lesssim +M n (\log d)^{3/2}. +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}] + +We apply results from \citet{minsker2019moment}. + +\proofparagraph{decoupling} + +Let $\bar U_n = \sum_{i=1}^n \sum_{j=1}^n H(X_i^{(1)}, X_j^{(2)})$ +be a decoupled matrix U-statistic, +where $X^{(1)}$ and $X^{(2)}$ +are i.i.d.\ copies of the sequence $X_1, \ldots, X_n$. +By Lemma~5.2 in \citet{minsker2019moment}, +since we are only stating this result for +degenerate U-statistics of order 2, +there exists a universal constant $D_2$ +such that for any $t > 0$, +we have +% +\begin{align*} +\P\left( +\|U_n\|_2 \geq t +\right) +&\leq +D_2 +\P\left( +\|\bar U_n\|_2 \geq t / D_2 +\right). +\end{align*} + +\proofparagraph{concentration of the decoupled U-statistic} + +By Equation~11 in \citet{minsker2019moment}, +we have the following concentration inequality +for decoupled degenerate U-statistics. +For some universal constant $C_1$ +and for any $t > 0$, +% +\begin{align*} +\P\left( +\|\bar U_n\|_2 +\geq +C_1 \sigma n (t + \log d) ++ C_1 M \sqrt{n} (t + \log d)^{3/2} +\right) +&\leq +e^{-t}. +\end{align*} + +\proofparagraph{concentration of the original U-statistic} + +Hence we have +% +\begin{align*} +&\P\left( +\|U_n\|_2 +\geq +C_1 D_2 \sigma n (t + \log d) ++ C_1 D_2 M \sqrt{n} (t + \log d)^{3/2} +\right) \\ +&\quad\leq +D_2 \P\left( +\|\bar U_n\|_2 +\geq +C_1 \sigma n (t + \log d) ++ C_1 M \sqrt{n} (t + \log d)^{3/2} +\right) +\leq +D_2 e^{-t}. +\end{align*} +% +The main result follows by setting +$C = C_1 + C_1 D_2$. + +\proofparagraph{moment bound} + +We now obtain a moment bound for the simplified version. 
+We already have that
+%
+\begin{align*}
+\P\left(
+\|U_n\|_2
+\geq
+2 C M n
+(t + \log d)^{3/2}
+\right)
+&\leq
+C e^{-t}.
+\end{align*}
+%
+This implies that for any $t \geq \log d$,
+we have
+%
+\begin{align*}
+\P\left(
+\|U_n\|_2
+\geq
+8 C M n
+t^{3/2}
+\right)
+&\leq
+C e^{-t}.
+\end{align*}
+%
+Defining
+$s = 8 C M n t^{3/2}$
+so $t = \left( \frac{s}{8C M n} \right)^{2/3}$
+shows that for any $s \geq 8C M n(\log d)^{3/2}$,
+%
+\begin{align*}
+\P\left(
+\|U_n\|_2
+\geq
+s
+\right)
+&\leq
+C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}.
+\end{align*}
+%
+Hence the moment bound is obtained:
+%
+\begin{align*}
+\E\left[
+\|U_n\|_2
+\right]
+&=
+\int_0^\infty
+\P\left(
+\|U_n\|_2
+\geq
+s
+\right)
+\diff{s} \\
+&=
+\int_0^{8C M n(\log d)^{3/2}}
+\P\left(
+\|U_n\|_2
+\geq
+s
+\right)
+\diff{s}
++
+\int_{8C M n(\log d)^{3/2}}^\infty
+\P\left(
+\|U_n\|_2
+\geq
+s
+\right)
+\diff{s} \\
+&\leq
+8C M n(\log d)^{3/2}
++
+\int_0^\infty
+C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}
+\diff{s} \\
+&=
+8C M n(\log d)^{3/2}
++
+8C M n
+\int_0^\infty
+C e^{-s^{2/3}}
+\diff{s}
+\lesssim
+Mn(\log d)^{3/2}.
+\end{align*}
+\end{proof}
+
+\subsection{Technical lemmas}
+
+Before presenting the proof of
+Lemma~\ref{lem:kernel_app_maximal_entropy},
+we give some auxiliary lemmas;
+namely a symmetrization inequality
+(Lemma~\ref{lem:kernel_app_symmetrization}),
+a Rademacher contraction principle
+(Lemma~\ref{lem:kernel_app_contraction}),
+and a Hoffmann--J{\o}rgensen inequality
+(Lemma~\ref{lem:kernel_app_hoffmann}).
+Recall that the Rademacher distribution
+places probability mass of $1/2$
+on each of the points $-1$ and $1$.
+
+\begin{lemma}[A symmetrization inequality for i.n.i.d.\ variables]
+\label{lem:kernel_app_symmetrization}
+
+Let $(S, \cS)$ be a measurable space and
+$\cF$ a class of Borel-measurable functions
+from $S$ to $\R$ which is pointwise measurable
+(i.e.\ it contains a countable dense subset
+under pointwise convergence).
+Let $X_1, \ldots, X_n$
+be independent
+but not necessarily identically distributed
+$S$-valued random variables.
+Let $a_1, \ldots, a_n$ be arbitrary points in $S$
+and $\phi$ a non-negative non-decreasing convex function
+from $\R$ to $\R$.
+Define $\varepsilon_1, \ldots, \varepsilon_n$
+as independent Rademacher
+random variables,
+independent of $X_1, \ldots, X_n$.
+Then
+%
+\begin{align*}
+\E \left[
+\phi \left(
+\sup_{f \in \cF}
+\left|
+\sum_{i=1}^n
+\Big(
+f(X_i)
+- \E[f(X_i)]
+\Big)
+\right|
+\right)
+\right]
+&\leq
+\E \left[
+\phi \left(
+2
+\sup_{f \in \cF}
+\left|
+\sum_{i=1}^n
+\varepsilon_i
+\Big(
+f(X_i)
+- a_i
+\Big)
+\right|
+\right)
+\right].
+\end{align*}
+%
+Note that in particular this holds with $a_i = 0$
+and also holds with $\phi(t) = t \vee 0$.
+
+\end{lemma}
+
+\begin{proof}[Lemma~\ref{lem:kernel_app_symmetrization}]
+
+See Lemma~2.3.6 in
+\citet{van1996weak}.
+%
+\end{proof}
+
+\begin{lemma}[A Rademacher contraction principle]
+\label{lem:kernel_app_contraction}
+
+Let $\varepsilon_1, \ldots, \varepsilon_n$
+be independent Rademacher random variables
+and $\cT$ be a bounded subset of $\R^n$.
+Define
+$M = \sup_{t \in \cT} \max_{1 \leq i \leq n} |t_i|$.
+Then, noting that the supremum is measurable
+because $\cT$ is a subset of a separable metric space
+and is therefore itself separable,
+%
+\begin{align*}
+\E
+\left[
+\sup_{t \in \cT}
+\left|
+\sum_{i=1}^n
+\varepsilon_i
+t_i^2
+\right|
+\right]
+&\leq
+4M \,
+\E
+\left[
+\sup_{t \in \cT}
+\left|
+\sum_{i=1}^n
+\varepsilon_i
+t_i
+\right|
+\right].
+\end{align*} +% +This gives the following corollary. +Let $X_1, \ldots, X_n$ be mutually independent +and also independent of $\varepsilon_1, \ldots, \varepsilon_n$. +Let $\cF$ be a pointwise measurable class of functions +from a measurable space $(S, \cS)$ to $\R$, +with measurable envelope $F$. +Define $M = \max_i F(X_i)$. +Then we obtain +% +\begin{align*} +\E +\left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i)^2 +\right| +\right] +&\leq +4 +\E +\left[ +M +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right]. +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_contraction}] + +Apply Theorem~4.12 from \citet{ledoux1991probability} with $F$ the identity +function and +% +\begin{align*} +\psi_i(s) += \psi(s) +&= +\min +\left( +\frac{s^2}{2M}, +\frac{M}{2} +\right). +\end{align*} +% +This is a weak contraction +(i.e.\ 1-Lipschitz) +because it is continuous, +differentiable on $(-M,M)$ +with derivative bounded by +$|\psi'(s)| \leq |s|/M \leq 1$, +and constant outside $(-M,M)$. +Note that since $|t_i| \leq M$ +by definition, +we have $\psi_i(t_i) = t_i^2 / (2M)$. +Hence +by Theorem~4.12 +from \citet{ledoux1991probability}, +% +\begin{align*} +\E +\left[ +F +\left( +\frac{1}{2} +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +\psi_i(t_i) +\right| +\right) +\right] +&\leq +\E +\left[ +F +\left( +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i +\right| +\right) +\right], \\ +\E +\left[ +\frac{1}{2} +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +\frac{t_i^2}{2M} +\right| +\right] +&\leq +\E +\left[ +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i +\right| +\right], \\ +\E +\left[ +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i^2 +\right| +\right] +&\leq +4M \, +\E +\left[ +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i +\right| +\right]. +\end{align*} +% +For the corollary, set +$\cT = \left\{\big(f(X_1), \ldots, f(X_n)\big) : f \in \cF\right\}$. +For a fixed realization +$X_1, \ldots, X_n$, +% +\begin{align*} +\E_\varepsilon +\left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i)^2 +\right| +\right] +&= +\E_\varepsilon +\left[ +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i^2 +\right| +\right] \\ +&\leq 4 +\E_\varepsilon +\left[ +M +\sup_{t \in \cT} +\left| +\sum_{i=1}^n +\varepsilon_i +t_i +\right| +\right] += 4 \E_\varepsilon +\left[ +M +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right]. +\end{align*} +% +Taking an expectation over $X_1, \ldots, X_n$ +and applying Fubini's theorem yields the result. +\end{proof} + +\begin{lemma}[A Hoffmann--J{\o}rgensen inequality] +\label{lem:kernel_app_hoffmann} + +Let $(S, \cS)$ be a measurable space +and $X_1, \ldots, X_n$ +be $S$-valued random variables. +Suppose that +$\cF$ is a pointwise measurable class of functions from $S$ to $\R$ +with finite envelope $F$. +Let $\varepsilon_1, \ldots, \varepsilon_n$ +be independent Rademacher variables +independent of $X_1, \ldots, X_n$. +For $q \in (1, \infty)$, +% +\begin{align*} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +^q +\right] +^{1/q} +&\leq +C_q +\left( +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right] ++ +\E \left[ +\max_{1 \leq i \leq n} +\sup_{f \in \cF} +\big| f(X_i) \big|^q +\right]^{1/q} +\right), +\end{align*} +% +where $C_q$ is a positive constant depending only on $q$. 
+ +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_hoffmann}] + +We use Talagrand's formulation of +a Hoffmann--J{\o}rgensen inequality. +Consider the +independent +$\ell^\infty(\cF)$-valued +random functionals $u_i$ defined by +$u_i(f) = \varepsilon_i f(X_i)$, +where $\ell^\infty(\cF)$ +is the Banach space of bounded functions from +$\cF$ to $\R$, +equipped with the norm +$\|u\|_\cF = \sup_{f \in \cF} |u(f)|$. +Then Remark~3.4 in \citet{kwapien1991hypercontraction} gives +% +\begin{align*} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +u_i(f) +\right| +^q +\right] +^{1/q} +&\leq +C_q +\left( +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +u_i(f) +\right| +\right] ++ +\E \left[ +\max_{1 \leq i \leq n} +\sup_{f \in \cF} +\left| +u_i(f) +\right|^q +\right]^{1/q} +\right) \\ +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +^q +\right] +^{1/q} +&\leq +C_q +\left( +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right] ++ +\E \left[ +\max_{1 \leq i \leq n} +\sup_{f \in \cF} +\big| f(X_i) \big|^q +\right]^{1/q} +\right). +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_entropy}] + +We follow the proof of Theorem~5.2 +from \citet{chernozhukov2014gaussian}, +using our i.n.i.d.\ versions of the symmetrization inequality +(Lemma~\ref{lem:kernel_app_symmetrization}), +Rademacher contraction principle +(Lemma~\ref{lem:kernel_app_contraction}), +and Hoffmann--J{\o}rgensen inequality +(Lemma~\ref{lem:kernel_app_hoffmann}). + +Without loss of generality, +we may assume that $J(1, \cF, F) < \infty$ +as otherwise there is nothing to prove, +and that $F > 0$ everywhere on $S$. +Let $\P_n = n^{-1} \sum_i \delta_{X_i}$ +be the empirical distribution +of $X_i$, +and define the empirical variance bound +$\sigma_n^2 = \sup_\cF n^{-1} \sum_i f(X_i)^2$. +By the i.n.i.d.\ symmetrization inequality +(Lemma~\ref{lem:kernel_app_symmetrization}), +% +\begin{align*} +\E \left[ +\sup_{f \in \cF} +\big| G_n(f) \big| +\right] +&= +\frac{1}{\sqrt n} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\Big( +f(X_i) +- \E[f(X_i)] +\Big) +\right| +\right] +\leq +\frac{2}{\sqrt n} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right], +\end{align*} +% +where $\varepsilon_1, \ldots, \varepsilon_n$ +are independent Rademacher random variables, +independent of $X_1, \ldots, X_n$. +Then the standard entropy integral inequality +from the proof of Theorem~5.2 in +the supplemental materials for +\citet{chernozhukov2014gaussian} +gives for a universal constant $C_1 > 0$, +% +\begin{align*} +\frac{1}{\sqrt n} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\Bigm\vert +X_1, \ldots, X_n +\right] +&\leq +C_1 \|F\|_{\P_n,2} +\, J(\sigma_n / \|F\|_{\P_n,2}, \cF, F). +\end{align*} +% +Taking marginal expectations +and applying Jensen's inequality along with +a convexity result for the covering integral, +as in Lemma~A.2 in \citet{chernozhukov2014gaussian}, gives +% +\begin{align*} +Z +&\vcentcolon= +\frac{1}{\sqrt n} +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right] +\leq +C_1 \|F\|_{\bar\P,2} +\, J(\E[\sigma_n^2]^{1/2} / \|F\|_{\bar\P,2}, \cF, F). 
+\end{align*} +% +Now use symmetrization +(Lemma~\ref{lem:kernel_app_symmetrization}), +the contraction principle +(Lemma~\ref{lem:kernel_app_contraction}), +the Cauchy--Schwarz inequality, +and the Hoffmann--J{\o}rgensen inequality +(Lemma~\ref{lem:kernel_app_hoffmann}) +to deduce that +% +\begin{align*} +\E[\sigma_n^2] +&= +\E\left[ +\sup_{f \in \cF} +\frac{1}{n} +\sum_{i=1}^n +f(X_i)^2 +\right] +\leq +\sup_{f \in \cF} +\E_{\bar\P} \left[ +f(X_i)^2 +\right] ++ \frac{1}{n} +\E\left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +f(X_i)^2 +- \E \left[ +f(X_i)^2 +\right] +\right| +\right] \\ +&\leq +\sigma^2 ++ \frac{2}{n} +\E\left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i)^2 +\right| +\right] +\leq +\sigma^2 ++ \frac{8}{n} +\E\left[ +M +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right] \\ +&\leq +\sigma^2 ++ \frac{8}{n} +\E\left[ +M^2 +\right]^{1/2} +\E\left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right|^2 +\right]^{1/2} \\ +&\leq +\sigma^2 ++ \frac{8}{n} +\|M\|_{\P,2} \, +C_2 +\left( +\E \left[ +\sup_{f \in \cF} +\left| +\sum_{i=1}^n +\varepsilon_i +f(X_i) +\right| +\right] ++ +\E \left[ +\max_{1 \leq i \leq n} +\sup_{f \in \cF} +\big| f(X_i) \big|^2 +\right]^{1/2} +\right) \\ +&= +\sigma^2 ++ \frac{8C_2}{n} +\|M\|_{\P,2} \, +\left( +\sqrt{n} Z ++ +\|M\|_{\P,2} +\right) +\lesssim +\sigma^2 ++ +\frac{\|M\|_{\P,2} Z}{\sqrt n} ++ +\frac{\|M\|_{\P,2}^2}{n}, +\end{align*} +% +where $\lesssim$ indicates a bound up to a universal constant. +Hence taking a square root we see that, +following the notation from the proof of Theorem~5.2 +in the supplemental materials to +\citet{chernozhukov2014gaussian}, +% +\begin{align*} +\sqrt{\E[\sigma_n^2]} +&\lesssim +\sigma ++ +\|M\|_{\P,2}^{1/2} Z^{1/2} n^{-1/4} ++ +\|M\|_{\P,2} n^{-1/2} +\lesssim +\|F\|_{\bar\P,2} +\left( \Delta \vee \sqrt{DZ} \right), +\end{align*} +% +where +$\Delta^2 = \|F\|_{\bar\P,2}^{-2} +\big(\sigma^2 \vee (\|M\|_{\P,2}^2 / n) \big) \geq \delta^2$ +and +$D = \|M\|_{\P,2} n^{-1/2} \|F\|_{\bar\P,2}^{-2}$. +Thus returning to our bound on $Z$, +we now have +% +\begin{align*} +Z +&\lesssim +\|F\|_{\bar\P,2} +\, J(\Delta \vee \sqrt{DZ}, \cF, F). +\end{align*} +% +The final steps proceed as +in the proof of Theorem~5.2 +from \citet{chernozhukov2014gaussian}, +considering cases separately for +$\Delta \geq \sqrt{DZ}$ +and +$\Delta < \sqrt{DZ}$, +and applying convexity properties of +the entropy integral $J$. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_vc_inid}] + +We assume the VC-type condition +% +$\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) \leq +(C_1/\varepsilon)^{C_2}$ +% +for all $\varepsilon \in (0,1]$, +with constants +$C_1 \geq e$ and $C_2 \geq 1$. +Hence for $\delta \in (0,1]$, +the entropy integral can be bounded as +% +\begin{align*} +J\big(\delta, \cF, F\big) +&= +\int_0^\delta +\sqrt{1 + +\sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} +\diff{\varepsilon} +\leq +\int_0^\delta +\sqrt{1 + +C_2 \log (C_1/\varepsilon)} +\diff{\varepsilon} \\ +&\leq +\int_0^\delta +\left( +1 + +\sqrt{C_2 \log (C_1/\varepsilon)} +\right) +\diff{\varepsilon} += +\delta ++ \sqrt{C_2} +\int_0^\delta +\sqrt{\log (C_1/\varepsilon)} +\diff{\varepsilon} \\ +&\leq +\delta ++ \sqrt{\frac{C_2}{\log (C_1/\delta)}} +\int_0^\delta +\log (C_1/\varepsilon) +\diff{\varepsilon} += +\delta ++ \sqrt{\frac{C_2}{\log (C_1/\delta)}} +\big( +\delta ++ \delta \log (C_1/\delta) +\big) \\ +&\leq +3 \delta +\sqrt{C_2 \log (C_1/\delta)}. 
+\end{align*} +% +The remaining equations now follow +by Lemma~\ref{lem:kernel_app_maximal_entropy}. +\end{proof} + +Before proving Lemma~\ref{lem:kernel_app_kmt_corollary}, +we give a bounded-variation characterization +(Lemma~\ref{lem:kernel_app_bv_characterization}). + +\begin{lemma}[A characterization of bounded-variation functions] +\label{lem:kernel_app_bv_characterization} + +Let $\cV_1$ be +the class of real-valued functions on $[0,1]$ +which are 0 at 1 and have total variation bounded by 1. +Also define the class of +half-interval indicator functions $\cI = \{\I[0,t]: t \in [0,1]\}$. +For any topological vector space $\cX$, +define the symmetric convex hull of a subset $\cY \subseteq \cX$ as +% +\begin{align*} +\symconv \cY +&= +\left\{ +\sum_{i=1}^n +\lambda_i +y_i : +\sum_{i=1}^n +\lambda_i += 1, \ +\lambda_i +\geq 0, \ +y_i \in \cY \cup -\cY, \ +n \in \N +\right\}. +\end{align*} +% +Denote its closure by $\overline\symconv \ \cY$. +Under the pointwise convergence topology, +$\cV_1 \subseteq \overline\symconv \ \cI$. + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_bv_characterization}] + +Firstly, let $\cD \subseteq \cV_1$ +be the class of real-valued functions +on $[0,1]$ +which are +0 at 1, +have total variation exactly 1, +and are weakly monotone decreasing. +Therefore, for $g \in \cD$, we have +$\|g\|_\TV = g(0) = 1$. +Let $S = \{s_1, s_2, \dots\} \subseteq [0,1]$ +be the countable set of discontinuity points of $g$. +We want to find a sequence of +convex combinations of elements of +$\cI$ which converges pointwise to $g$. +To do this, first define the sequence of meshes +% +\begin{align*} +A_n = +\{s_k : 1 \leq k \leq n\} +\cup +\{k/n : 0 \leq k \leq n\}, +\end{align*} +% +which satisfies +$\bigcup_n A_n = S \cup ([0,1] \cap \Q)$. +Endow $A_n$ with the ordering +induced by the canonical order on $\R$, +giving $A_n = \{a_1, a_2, \ldots\}$, +and define the sequence of functions +% +\begin{align*} +g_n(x) += \sum_{k = 1}^{|A_n|-1} +\I[0,a_k] +\big( g(a_k) - g(a_{k+1}) \big), +\end{align*} +% +where clearly +$\I[0, a_k] \in \cI$, +$g(a_k) - g(a_{k+1}) \geq 0$, +and +$\sum_{k = 1}^{|A_n|-1} +\big( +g(a_k) - g(a_{k+1}) +\big) += g(0) - g(1) = 1$. +Therefore $g_n$ is a convex combination of elements of $\cI$. +Further, note that for +$a_k \in A_n$, +% +\begin{align*} +g_n(a_k) += \sum_{j = k}^{|A_n|-1} +\big( g(a_j) - g(a_{j+1}) \big) += g(a_k) - g(a_{|A_n|}) += g(a_k) - g(1) += g(a_k). +\end{align*} +% +Hence if $x \in S$, then eventually $x \in A_n$ so $g_n(x) \to g(x)$. +Alternatively, if $x \not\in S$, then $g$ is continuous at $x$. +But $g_n \to g$ on the dense set $\bigcup_n A_n$, +so also $g_n(x) \to g(x)$. +Hence $g_n \to g$ +pointwise on $[0,1]$. + +Now take $f \in \cV_1$. +By the Jordan decomposition for +total variation functions +\citep{royden1988real}, +we can write +$f = f^+ - f^-$, +with +$f^+$ and $f^-$ weakly decreasing, +$f^+(1) = f^-(1) = 0$, +and +$\|f^+\|_\TV + \|f^-\|_\TV = \|f\|_\TV$. +Supposing that both +$\|f^+\|_\TV$ and $\|f^-\|_\TV$ +are strictly positive, let +$g_n^+$ approximate +the unit-variation function +$f^+/\|f^+\|_\TV$ +and +$g_n^-$ approximate $f^-/\|f^-\|_\TV$ +as above. +Then since trivially +% +\begin{align*} +f = +\|f^+\|_\TV f^+ / \|f^+\|_\TV +- \|f^-\|_\TV f^- / \|f^-\|_\TV ++ \big(1 - \|f^+\|_\TV - \|f^-\|_\TV) \cdot 0, +\end{align*} +% +we have that +the convex combination +% +\begin{align*} +g_n^+ \|f^+\|_\TV +- g_n^- \|f^-\|_\TV ++ \big(1 - \|f^+\|_\TV - \|f^-\|_\TV) \cdot 0 +\end{align*} +% +converges pointwise to $f$. 
+This also holds if either of the total variations +$\|f^\pm\|_\TV$ +are zero, +since then the corresponding sequence $g_n^\pm$ +need not be defined. +Now note that each of +$g_n^+$, $\,-g_n^-$, and $0$ +are in $\symconv \cI$, so +$f \in \overline\symconv \ \cI$ +under pointwise convergence. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_kmt_corollary}] + +We follow the Gaussian approximation method given in +Section~2 of \citet{gine2004kernel}. +The KMT approximation theorem \citep{komlos1975approximation} +asserts the existence +of a probability space +carrying $n$ i.i.d.\ uniform random variables +$\xi_1, \ldots, \xi_n \sim \Unif[0,1]$ +and a standard Brownian motion +$B_n(s): s \in [0,1]$ +such that if +% +\begin{align*} +\alpha_n(s) +&\vcentcolon= +\frac{1}{\sqrt{n}} +\sum_{i=1}^n +\big( +\I\{\xi_i \leq s\} - s +\big), +&\beta_n(s) +&\vcentcolon= +B_n(s) - s B_n(1), +\end{align*} +% +then +for some universal positive constants +$C_1$, $C_2$, $C_3$, +and for all $t > 0$, +% +\begin{align*} +\P\left( +\sup_{s \in [0,1]} +\big| \alpha_n(s) - \beta_n(s) \big| +> \frac{t + C_1\log n}{\sqrt{n}} +\right) +\leq C_2 e^{-C_3 t}. +\end{align*} +% +We can +view $\alpha_n$ and $\beta_n$ as random functionals +defined on the class of +half-interval indicator functions +$\cI = \big\{\I[0,s]: s \in [0,1]\big\}$ +in the following way. +% +\begin{align*} +\alpha_n(\I[0,s]) +&= \frac{1}{\sqrt{n}} +\sum_{i=1}^n +\big( \I[0,s](\xi_i) - \E[\I[0,s](\xi_i)]), \\ +\beta_n(\I[0,s]) +&= \int_0^1 \I[0,s](u) \diff{B_n(u)} +- B_n(1) \int_0^1 \I[0,s](u) \diff{u}, +\end{align*} +% +where the integrals are defined as It{\^o} and +Riemann--Stieltjes integrals in +the usual way for stochastic integration against semimartingales +\citep[Chapter~5]{legall2016brownian}. +Now we extend their definitions to the class +$\cV_1$ +of functions on $[0,1]$ +which are 0 at 1 and have total variation bounded by 1. +This is achieved by +noting that by Lemma~\ref{lem:kernel_app_bv_characterization}, +we have +$\cV_1 \subseteq \overline\symconv \ \cI$ +where $\overline{\symconv} \ \cI$ is the +smallest +symmetric convex class containing $\cI$ +which is closed under pointwise convergence. +Thus by the dominated convergence theorem, +every function in $\cV_1$ is approximated in $L^2$ by finite convex +combinations of functions in $\pm\cI$, +and the extension to $g \in \cV_1$ follows +by linearity and $L^2$ convergence of (stochastic) integrals: +% +\begin{align*} +\alpha_n(g) +&= +\frac{1}{\sqrt{n}} +\sum_{i=1}^n +\big( g(\xi_i) - \E[g(\xi_i)]), +&\beta_n(g) +&= \int_0^1 g(s) \diff{B_n(s)} +- B_n(1) \int_0^1 g(s) \diff{s}. +\end{align*} +% +Now we show that the norm induced on +$(\alpha_n - \beta_n)$ +by the function class $\cV_1$ is a.s.\ identical to the +supremum norm. 
+Writing the sums as integrals and using integration by parts +for finite-variation Lebesgue--Stieltjes and It\^o integrals, +and recalling that $g(1) = \alpha_n(0) = B_n(0) = 0$, +% +\begin{align*} +\sup_{g \in \cV_1} +\big|\alpha_n(g) - \beta_n(g)\big| +&= +\sup_{g \in \cV_1} +\left| +\int_0^1 g(s) \diff{\alpha_n(s)} +- \int_0^1 g(s) \diff{B_n(s)} ++ B_n(1) \int_0^1 g(s) \diff{s} +\right| \\ +&= +\sup_{g \in \cV_1} +\left| +\int_0^1 \alpha_n(s) \diff{g(s)} +- \int_0^1 B_n(s) \diff{g(s)} ++ B_n(1) \int_0^1 s \diff{g(s)} +\right| \\ +&= +\sup_{g \in \cV_1} +\left| +\int_0^1 \big(\alpha_n(s) - \beta_n(s)\big) +\diff{g(s)} +\right| += \sup_{s \in [0,1]} +\big| +\alpha_n(s) - \beta_n(s) +\big|, +\end{align*} +% +where in the last line +the upper bound is because $\|g\|_\TV \leq 1$, +and the lower bound is by taking +$g_\varepsilon = \pm \I[0,s_\varepsilon]$ where +$|\alpha_n(s_\varepsilon) - \beta_n(s_\varepsilon)| +\geq \sup_s |\alpha_n(s) - \beta_n(s)| - +\varepsilon$. +Hence we obtain +% +\begin{align} +\label{eq:kernel_app_kmt_concentration} +\P\left( +\sup_{g \in \cV_1} +\big|\alpha_n(g) - \beta_n(g)\big| +> \frac{t + C_1\log n}{\sqrt{n}} +\right) +\leq C_2 e^{-C_3 t}. +\end{align} +% +Now define $V_n = \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV$, +noting that if $V_n = 0$ then the result is trivially true +by setting $Z_n = 0$. +Let $F_X$ be the common c.d.f.\ of $X_i$, +and define the quantile function +$F_X^{-1}(s) = \inf \{u: F_X(u) \geq s\}$ for $s \in [0,1]$, +writing $\inf \emptyset = \infty$ +and $\inf \R = -\infty$. +Consider the function class +% +\begin{align*} +\cG_n = \big\{ +V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) +- V_n^{-1} g_n\big(F_X^{-1}(1), x\big) +: x \in \R \big\}, +\end{align*} +% +noting that $g_n(\cdot,x)$ +is finite-variation so +$g_n(\pm \infty, x)$ +can be interpreted as +the relevant limit. +By monotonicity of $F_X$ and the definition of $V_n$, +the members of $\cG_n$ have total variation of at most $1$ +and are 0 at 1, implying that +$\cG_n \subseteq \cV_1$. +Noting that $\alpha_n$ and $\beta_n$ are random +linear operators which a.s.\ annihilate +constant functions, +define +% +\begin{align*} +Z_n(x) +&= +\beta_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) += V_n \beta_n \Big( +V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) +- V_n^{-1} g_n\big(F_X^{-1}(1), x\big) +\Big), +\end{align*} +% +which is a mean-zero continuous Gaussian process. 
+Its covariance structure is +% +\begin{align*} +&\E[Z_n(x) Z_n(x')] \\ +&= +\E\bigg[ +\left( +\int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} +- B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} +\right) \\ +&\quad\times +\left( +\int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} +- B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} +\right) +\bigg] \\ +&= +\E\left[ +\int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} +\int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} +\right] \\ +&\quad- \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \ +\E\left[ +B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} +\right] \\ +&\quad- +\int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ +\E\left[ +B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} +\right] \\ +&\quad+ +\int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} +\int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ +\E\left[ +B_n(1)^2 +\right] \\ +&= +\int_0^1 g_n\big(F_X^{-1}(s),x\big) +g_n\big(F_X^{-1}(s),x'\big) \diff{s} +- \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} +\int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \\ +&= +\E\Big[ +g_n\big(F_X^{-1}(\xi_i), x\big) +g_n\big(F_X^{-1}(\xi_i), x'\big) +\Big] +- \E\Big[ +g_n\big(F_X^{-1}(\xi_i), x\big) +\Big] +\E\Big[ +g_n\big(F_X^{-1}(\xi_i), x'\big) +\Big] \\ +&= +\E\Big[ +g_n\big(X_i, x\big) +g_n\big(X_i, x'\big) +\Big] +- \E\Big[ +g_n\big(X_i, x\big) +\Big] +\E\Big[ +g_n\big(X_i, x'\big) +\Big] += +\E\big[ +G_n(x) +G_n(x') +\big] +\end{align*} +% +as desired, by the It\^o isometry for stochastic integrals, +writing $B_n(1) = \int_0^1 \diff{B_n(s)}$; +and noting that $F_X^{-1}(\xi_i)$ +has the same distribution as $X_i$. +Finally, note that +% +\begin{align*} +G_n(x) +&= +\alpha_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) += V_n \alpha_n \Big( +V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) +- V_n^{-1} g_n\big(F_X^{-1}(1), x\big) +\Big), +\end{align*} +% +and so by \eqref{eq:kernel_app_kmt_concentration} +% +\begin{align*} +\P\left( +\sup_{x \in \R} +\Big|G_n(x) - Z_n(x)\Big| +> V_n \frac{t + C_1 \log n}{\sqrt n} +\right) +&\leq +\P\left( +\sup_{g \in \cV_1} +\big|\alpha_n(g) - \beta_n(g)\big| +> \frac{t + C_1\log n}{\sqrt{n}} +\right) \\ +&\leq C_2 e^{-C_3 t}. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_yurinskii_corollary}] + +Take $0 < \delta_n \leq \Leb(\cX_n)$ and let +$\cX_n^\delta = \big\{ x_1, \dots, x_{|\cX_n^\delta|}\big\}$ +be a $\delta_n$-covering of $\cX_n$ with cardinality +$|\cX_n^\delta| \leq \Leb(\cX_n)/\delta_n$. +Suppose that $\left|\log \delta_n\right| \lesssim C_1 \log n$ +up to a universal constant. +We first use the Yurinskii coupling to +construct a Gaussian process +$Z_n$ +which is close to $G_n$ +on this finite cover. +Then we bound the fluctuations in $G_n$ +and in $Z_n$ +using entropy methods. + +\proofparagraph{Yurinskii coupling} + +Define the i.n.i.d.\ +and mean-zero variables +% +\begin{align*} +h_i(x) +&= +\frac{1}{\sqrt n} +\Big( +g_n(X_i', x) +- \E[g_n(X_i', x)] +\Big), +\end{align*} +% +where $X_1', \ldots, X_n'$ +are independent copies of $X_1, \ldots, X_n$ +on some new probability space, +so that we have +$G_n(x) = \sum_{i=1}^n h_i(x)$ +in distribution. +Also define the length-$|\cX_n^\delta|$ random vector +% +\begin{align*} +h_i^\delta +&= +\big( +h_i(x): x \in \cX_n^\delta +\big). 
+\end{align*} +% +By an extension of +Yurinskii's coupling +to general norms +\citep[supplemental materials, Lemma~38]{belloni2019conditional}, +there exists on the new probability space a +Gaussian length-$|\cX_n^\delta|$ vector $Z_n^\delta$ +which is mean-zero +and with the same covariance structure as +$ +\sum_{i=1}^n +h_i^\delta +$ +satisfying +% +\begin{align*} +\P\left( +\bigg\| +\sum_{i=1}^n +h_i^\delta +- Z_n^\delta +\bigg\|_\infty +> 3 t_n +\right) +\leq +\min_{s > 0} +\left( +2 \P\big( \|N\|_\infty > s) ++ \frac{\beta s^2}{t_n^3} +\right), +\end{align*} +% +where +% +\begin{align*} +\beta += \sum_{i=1}^n +\Big( +\E\big[\|h_i^\delta\|_2^2 \, +\|h_i^\delta\|_\infty +\big] ++ \E\big[\|z_i\|_2^2 \, +\|z_i\|_\infty +\big] +\Big), +\end{align*} +% +with $z_i \sim \cN(0, \Var[h_i^\delta])$ +independent and $N \sim \cN(0, I_{|\cX_n^\delta|})$. +By the bounds on $g_n$, +% +\begin{align*} +\E\big[\|h_i^\delta\|_2^2 \, +\|h_i^\delta\|_\infty \, +\big] +\leq +\frac{M_n}{\sqrt n} +\E\big[\|h_i^\delta\|_2^2 \, +\big] += +\frac{M_n}{\sqrt n} +\sum_{x \in \cX_n^\delta} +\E\big[h_i(x)^2 \, +\big] +\leq +\frac{M_n}{\sqrt n} +\frac{|\cX_n^\delta| \sigma_n^2}{n} +\leq +\frac{M_n \sigma_n^2 \Leb(\cX_n)}{n^{3/2}\delta_n}. +\end{align*} +% +By the fourth moment bound for Gaussian variables, +% +\begin{align*} +\E\big[ +\|z_i\|_2^4 \, +\big] +&\leq +|\cX_n^\delta| \, +\E\big[ +\|z_i\|_4^4 +\big] +\leq +|\cX_n^\delta|^2 \, +\max_j +\E\big[ +(z_i^{(j)})^4 +\big] +\leq +3 +|\cX_n^\delta|^2 \, +\max_j +\E\big[ +(z_i^{(j)})^2 +\big]^2 \\ +&= +3 +|\cX_n^\delta|^2 \, +\max_{x \in \cX_n^\delta} +\E\big[ +h_i(x)^2 +\big]^2 +\leq +\frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2} . +\end{align*} +% +Also by Jensen's inequality +and for $|\cX_n^\delta| \geq 2$, +assuming $C_1 > 1$ without loss of generality, +% +\begin{align*} +\E\big[ +\|z_i\|_\infty^2 +\big] +&\leq +\frac{4 \sigma_n^2}{n} +\log +\E\big[ +e^{\|z_i\|_\infty^2 / (4\sigma_n^2/n)} +\big] +\leq +\frac{4 \sigma_n^2}{n} +\log +\E\left[ +\sum_{j=1}^{|\cX_n^\delta|} +e^{(z_i^{(j)})^2 / (4\sigma_n^2/n)} +\right] +\leq +\frac{4\sigma_n^2}{n} +\log \big(2|\cX_n^\delta|\big) \\ +&\leq +\frac{4\sigma_n^2}{n} +\left( +\log 2 + \log \Leb(\cX_n) - \log \delta_n +\right) +\leq +\frac{12 C_1 \sigma_n^2 \log n}{n}, +\end{align*} +% +where we used the moment +generating function of a $\chi_1^2$ random variable. +Therefore we can apply the Cauchy--Schwarz inequality +to obtain +% +\begin{align*} +\E\big[\|z_i\|_2^2 \, +\|z_i\|_\infty +\big] +&\leq +\sqrt{ +\E\big[\|z_i\|_2^4 +\big]} +\sqrt{ +\E\big[ +\|z_i\|_\infty^2 +\big]} +\leq +\sqrt{ +\frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2}} +\sqrt{ \frac{12 C_1 \sigma_n^2 \log n}{n} } \\ +&\leq +\frac{6\sigma_n^3 +\Leb(\cX_n) +\sqrt{C_1 \log n}}{n^{3/2} \delta_n}. +\end{align*} +% +Now summing over the $n$ samples gives +% +\begin{align*} +\beta +\leq +\frac{M_n \sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} ++ \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}} +{\sqrt n \delta_n} += +\frac{\sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} +\Big(M_n + 6\sigma_n \sqrt{C_1 \log n}\Big). +\end{align*} +% +By a union bound +and Gaussian tail probabilities, +we have that +$\P\big( \|N\|_\infty > s) +\leq 2|\cX_n^\delta| e^{-s^2/2}$. 
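+For completeness, we record the two elementary facts used above,
+namely the moment generating function of a $\chi_1^2$ random variable
+and the standard Gaussian tail bound.
+If $Z \sim \cN(0,1)$, then for any $\lambda < 1/2$ and any $s > 0$,
+%
+\begin{align*}
+\E\big[e^{\lambda Z^2}\big]
+&=
+(1 - 2\lambda)^{-1/2},
+&\P\big(|Z| > s\big)
+&\leq
+2 e^{-s^2/2}.
+\end{align*}
+%
+In particular, if $\xi$ is a mean-zero Gaussian variable with
+$\Var[\xi] \leq v$ for some $v > 0$, then
+$\E\big[e^{\xi^2/(4v)}\big]
+\leq \E\big[e^{Z^2/4}\big]
+= \sqrt{2}
+\leq 2$,
+which is the bound applied to each coordinate of $z_i$ above,
+while a union bound over the $|\cX_n^\delta|$ standard Gaussian
+coordinates of $N$ gives the stated tail inequality.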
+Thus we get the following Yurinskii coupling inequality +for all $s > 0$: +% +\begin{align*} +\P\left( +\bigg\| +\sum_{i=1}^n +h_i^\delta +- Z_n^\delta +\bigg\|_\infty +> t_n +\right) +&\leq +\frac{4 \Leb(\cX_n)}{\delta_n} +e^{-s^2/2} ++ \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} +\Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). +\end{align*} +% +Note that +$Z_n^\delta$ +now extends +by the Vorob'ev--Berkes--Philipp theorem +(Lemma~\ref{lem:kernel_app_vbp}) +to a mean-zero Gaussian +process +$Z_n$ on the compact interval $\cX_n$ +with covariance structure +% +\begin{align*} +\E\big[ +Z_n(x) +Z_n(x') +\big] += +\E\big[ +G_n(x) +G_n(x') +\big], +\end{align*} +% +satisfying for any $s' > 0$ +% +\begin{align*} +&\P\left( +\sup_{x \in \cX_n^\delta} +\big| +G_n(x) - Z_n(x) +\big| +> t_n +\right) +\leq +\frac{4 \Leb(\cX_n)}{\delta_n} +e^{-s^2/2} ++ \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} +\Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). +\end{align*} + +\proofparagraph{regularity of $G_n$} + +Next we bound the fluctuations in +the empirical process $G_n$. +Consider the following classes of functions on $S$ +and their associated (constant) envelope functions. +By continuity of $g_n$, +each class is pointwise measurable +(to see this, restrict the index sets to rationals). +% +\begin{align*} +\cG_n +&= +\big\{ +g_n(\cdot, x): +x \in \cX_n +\big\}, +&\Env(\cG_n) +&= +M_n, \\ +\cG_n^\delta +&= +\big\{ +g_n(\cdot, x) +- g_n(\cdot, x'): +x, x' \in \cX_n, +|x-x'| \leq \delta_n +\big\}, +&\Env(\cG_n^\delta) +&= +l_{n,\infty} \delta_n. +\end{align*} +% +We first show these are VC-type. +By the uniform Lipschitz assumption, +% +\begin{align*} +\big\| +g_n(\cdot, x) +- g_n(\cdot, x') +\big\|_\infty +&\leq l_{n,\infty} |x-x'| +\end{align*} +% +for all $x,x' \in \cX_n$. +Therefore, with $\Q$ ranging over the +finitely-supported distributions +on $(S, \cS)$, +noting that any $\|\cdot\|_\infty$-cover +is a $\rho_\Q$-cover, +% +\begin{align*} +\sup_\Q +N\big(\cG_n, \rho_\Q, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) +&\leq +N\big(\cG_n, \|\cdot\|_\infty, +\varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) +\leq +N\big(\cX_n, |\cdot|, \varepsilon \!\Leb(\cX_n)\big) +\leq +1/\varepsilon. +\end{align*} +% +Replacing $\varepsilon$ by +$\varepsilon M_n/(l_{n,\infty} \Leb(\cX_n))$ +gives +% +\begin{align*} +\sup_\Q +N\big(\cG_n, \rho_\Q, \varepsilon M_n \big) +&\leq +\frac{l_{n,\infty} \Leb(\cX_n)}{\varepsilon M_n}, +\end{align*} +% +and so $\cG_n$ +is a VC-type class. +To see that $\cG_n^\delta$ +is also a VC-type class, +we construct a cover in the following way. +Let $\cF_n$ be an $\varepsilon$-cover +for $(\cG_n, \|\cdot\|_\infty)$. +By the triangle inequality, +$\cF_n - \cF_n$ is a $2\varepsilon$-cover +for $(\cG_n - \cG_n, \|\cdot\|_\infty)$ +of cardinality at most $|\cF_n|^2$, +where the subtractions are set subtractions. +Since $\cG_n^\delta \subseteq \cG_n - \cG_n$, +we see that $\cF_n - \cF_n$ is a $2\varepsilon$-external cover +for $\cG_n^\delta$. Thus +% +\begin{align*} +\sup_\Q +N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) +&\leq +N\big(\cG_n^\delta, \|\cdot\|_\infty, +\varepsilon l_{n,\infty} \Leb(\cX_n)\big) \\ +&\leq +N\big(\cG_n, \|\cdot\|_\infty, +\varepsilon l_{n,\infty} \Leb(\cX_n)\big)^2 +\leq +1/\varepsilon^2. 
+\end{align*} +% +Replacing $\varepsilon$ by +$\varepsilon \delta_n/\Leb(\cX_n)$ +gives +% +\begin{align*} +\sup_\Q +N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \delta_n \big) +&\leq +\frac{\Leb(\cX_n)^2}{\varepsilon^2 \delta_n^2} +\leq +(C_{1,n}/\varepsilon)^{2} +\end{align*} +% +with $C_{1,n} = \Leb(\cX_n) / \delta_n$, +demonstrating that $\cG_n^\delta$ +forms a VC-type class. +We now apply the maximal inequality +for i.n.i.d.\ data +given in +Lemma~\ref{lem:kernel_app_maximal_vc_inid}. +To do this, +note that +$\sup_{\cG_n^\delta} \|g\|_{\bar\P,2} +\leq l_{n,2} \delta_n$ +by the $L^2$ Lipschitz condition, and recall +$\Env(\cG_n^\delta) = l_{n,\infty} \delta_n$. +Therefore Lemma~\ref{lem:kernel_app_maximal_vc_inid} with +$\|F\|_{\bar\P,2} = l_{n,\infty} \delta_n$, +$\|M\|_{\P,2} = l_{n,\infty} \delta_n$, +and $\sigma = l_{n,2} \delta_n$ +gives, +up to universal constants +% +\begin{align*} +&\E\left[ +\sup_{g \in \cG_n^\delta} +\left| +\frac{1}{\sqrt{n}} +\sum_{i=1}^n +\Big( +g(X_i) +- \E[g(X_i)] +\Big) +\right| +\right] \\ +&\quad\lesssim +\sigma +\sqrt{2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} ++ +\frac{\|M\|_{\P,2} 2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} +{\sqrt{n}} \\ +&\quad\lesssim +l_{n,2} \delta_n +\sqrt{C_1 \log n} ++ +\frac{l_{n,\infty} \delta_n}{\sqrt n} +C_1 \log n, +\end{align*} +% +and hence by Markov's inequality, +% +\begin{align*} +&\P\left( +\sup_{|x-x'| \leq \delta_n} +\big| +G_n(x) - G_n(x') +\big| +> t_n +\right) \\ +&= +\P\left( +\sup_{|x-x'| \leq \delta_n} +\frac{1}{\sqrt{n}} +\left| +\sum_{i=1}^n +\Big( +g_n(X_i, x) - \E[g_n(X_i, x)] +- g_n(X_i, x') + \E[g_n(X_i, x')] +\Big) +\right| +> t_n +\right) \\ +&= +\P\left( +\sup_{g \in \cG_n^\delta} +\left| +\frac{1}{\sqrt{n}} +\sum_{i=1}^n +\Big( +g(X_i) - \E[g(X_i)] +\Big) +\right| +> t_n +\right) +\leq +\frac{1}{t} +\E\left[ +\sup_{g \in \cG_n^\delta} +\left| +\frac{1}{\sqrt{n}} +\sum_{i=1}^n +\Big( +g(X_i) - \E[g(X_i)] +\Big) +\right| +\right] \\ +&\lesssim +\frac{l_{n,2} \delta_n}{t_n} +\sqrt{C_1 \log n} ++ \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. +\end{align*} + +\proofparagraph{regularity of $Z_n$} + +Next we bound the fluctuations in the Gaussian process +$Z_n$. +Let $\rho$ be the following semimetric: +% +\begin{align*} +\rho(x, x')^2 +&= +\E\big[\big( Z_n(x) - Z_n(x') \big)^2\big] += +\E\big[\big( G_n(x) - G_n(x') \big)^2\big] \\ +&= +\frac{1}{n} +\sum_{i=1}^n +\E\big[\big( h_i(x) - h_i(x') \big)^2\big] +\leq +l_{n,2}^2 \, |x - x'|^2. +\end{align*} +% +Hence +$\rho(x, x') +\leq +l_{n,2} \, |x - x'|$. 
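+In particular, any $(\varepsilon / l_{n,2})$-cover of $\cX_n$
+in the Euclidean metric is an $\varepsilon$-cover of $\cX_n$
+with respect to $\rho$, so that for all $\varepsilon > 0$,
+%
+\begin{align*}
+N(\varepsilon, \cX_n, \rho)
+&\leq
+N(\varepsilon / l_{n,2}, \cX_n, |\cdot|)
+\leq
+1 + \frac{\Leb(\cX_n) \, l_{n,2}}{\varepsilon},
+\end{align*}
+%
+a comparison of covering numbers which is used repeatedly
+in the entropy integral calculation below.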
+By +the Gaussian process maximal inequality from +Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +we obtain that +% +\begin{align*} +&\E\bigg[ +\sup_{|x - x'| \leq \delta_n} +\big| +Z_n(x) - Z_n(x') +\big| +\bigg] +\lesssim +\E\bigg[ +\sup_{\rho(x,x') \leq l_{n,2} \delta_n} +\big| +Z_n(x) - Z_n(x') +\big| +\bigg] \\ +&\quad\leq +\int_0^{l_{n,2} \delta_n} +\sqrt{\log N(\varepsilon, \cX_n, \rho)} +\diff{\varepsilon} +\leq +\int_0^{l_{n,2} \delta_n} +\sqrt{\log N(\varepsilon / l_{n,2}, \cX_n, |\cdot|)} +\diff{\varepsilon} \\ +&\quad\leq +\int_0^{l_{n,2} \delta_n} +\sqrt{\log \left( 1 + \frac{\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} +\diff{\varepsilon} +\leq +\int_0^{l_{n,2} \delta_n} +\sqrt{\log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} +\diff{\varepsilon} \\ +&\quad\leq +\log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} +\int_0^{l_{n,2} \delta_n} +\log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right) +\diff{\varepsilon} \\ +&\quad= +\log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} +\left( +l_{n,2} \delta_n \log \left( 2 \Leb(\cX_n) l_{n,2} \right) ++ l_{n,2} \delta_n ++ l_{n,2} \delta_n \log \left( \frac{1}{l_{n,2} \delta_n} \right) +\right) \\ +&\quad= +\log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} +l_{n,2} \delta_n +\left( +1 + +\log \left( \frac{2\Leb(\cX_n)}{\delta_n} \right) +\right) +\lesssim +l_{n,2} \delta_n +\sqrt{\log \left( \frac{\Leb(\cX_n)}{\delta_n} \right)} \\ +&\quad\lesssim +l_{n,2} \delta_n +\sqrt{C_1 \log n}, +\end{align*} +% +where we used that $\delta_n \leq \Leb(\cX_n)$. +So by Markov's inequality, +% +\begin{align*} +\P\left( +\sup_{|x - x'| \leq \delta_n} +\big| +Z_n(x) - Z_n(x') +\big| +> t_n +\right) +&\lesssim +t_n^{-1} +l_{n,2} \delta_n +\sqrt{C_1 \log n}. +\end{align*} + +\proofparagraph{conclusion} + +By the results of the previous parts, +we have up to universal constants that +% +\begin{align*} +&\P\left( +\sup_{x \in \cX_n} +\big| +G_n(x) - Z_n(x) +\big| +> t_n +\right) \\ +&\quad\leq +\P\left( +\sup_{x \in \cX_n^\delta} +\big| +G_n(x) - Z_n(x) +\big| +> t_n / 3 +\right) ++ \P\left( +\sup_{|x-x'| \leq \delta_n} +\big| +G_n(x) - G_n(x') +\big| +> t_n / 3 +\right) \\ +&\qquad+ +\P\left( +\sup_{|x - x'| \leq \delta_n} +\big| +Z_n(x) - Z_n(x') +\big| +> t_n / 3 +\right) \\ +&\quad\lesssim +\frac{4 \Leb(\cX_n)}{\delta_n} +e^{-s^2/2} ++ \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} +\Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ +&\qquad+ +\frac{l_{n,2} \delta_n}{t_n} +\sqrt{C_1 \log n} ++ \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. 
+\end{align*}
+%
+Choosing an approximately optimal mesh size of
+%
+\begin{align*}
+\delta_n
+&=
+\sqrt{
+\frac{\sigma_n^2 \Leb(\cX_n) \log n}{\sqrt n t_n^3}
+\Big(M_n + \sigma_n \sqrt{\log n}\Big)
+} \Bigg/
+\sqrt{
+t_n^{-1}
+l_{n,2}
+\sqrt{\log n}
+\left(
+1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt{n}}
+\right)
+}
+\end{align*}
+%
+gives $\left|\log \delta_n\right| \lesssim C_1 \log n$
+up to a universal constant,
+so with $s$ a large enough multiple of $\sqrt{\log n}$,
+%
+\begin{align*}
+&\P\left(
+\sup_{x \in \cX_n}
+\big|
+G_n(x) - Z_n(x)
+\big|
+> t_n
+\right) \\
+&\quad\lesssim
+\frac{4 \Leb(\cX_n)}{\delta_n}
+e^{-s^2/2}
++ \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3}
+\Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\
+&\qquad+
+\frac{l_{n,2} \delta_n}{t_n}
+\sqrt{C_1 \log n}
++ \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n \\
+&\quad\lesssim
+\delta_n
+\frac{l_{n,2} \sqrt {\log n}}{t_n}
+\left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt n} \right) \\
+&\quad\lesssim
+\frac{\sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n}
+\sqrt{M_n + \sigma_n \sqrt{\log n}}}
+{n^{1/4} t_n^2}
+\sqrt{l_{n,2} \sqrt {\log n}
++ \frac{l_{n,\infty}}{\sqrt n} \log n}.
+\end{align*}
+%
+\end{proof}
+
+\begin{proof}[Lemma~\ref{lem:kernel_app_vbp}]
+
+The proof is by induction on the number of vertices in the tree.
+Let $\cT$ have $n$ vertices,
+and suppose that vertex $n$ is a leaf
+connected to vertex $n-1$ by an edge,
+relabeling the vertices if necessary.
+By the induction hypothesis we assume that there is a
+probability measure $\P^{(n-1)}$
+on $\prod_{i=1}^{n-1} \cX_i$
+whose projections onto $\cX_i$ are $\P_i$
+and whose projections onto $\cX_i \times \cX_j$ are $\P_{i j}$,
+for $i,j \leq n-1$.
+Now apply the original
+Vorob'ev--Berkes--Philipp theorem,
+which can be found as Theorem~1.1.10 in
+\citet{dudley1999uniform},
+to the spaces
+$\prod_{i=1}^{n-2} \cX_i$,\,
+$\cX_{n-1}$, and
+$\cX_n$;
+and to the laws
+$\P^{(n-1)}$
+and
+$\P_{n-1, n}$.
+This gives a law $\P^{(n)}$
+which agrees with $\P_i$
+at every vertex by definition,
+and agrees with
+$\P_{i j}$ for all $i,j \leq n-1$.
+It also agrees with $\P_{n-1,n}$,
+and this is the only edge touching vertex $n$.
+Hence $\P^{(n)}$ satisfies the desired properties.
+\end{proof}
+
+\subsection{Main results}
+\label{sec:kernel_app_main}
+
+We give supplementary details for our main results on consistency, minimax
+optimality, strong approximation, covariance estimation, feasible inference and
+counterfactual estimation.
+We begin with a basic fact about Lipschitz functions.
+
+\begin{lemma}[Lipschitz kernels are bounded]
+\label{lem:kernel_app_lipschitz_kernels_bounded}
+
+Let $\cX \subseteq \R$ be a connected set.
+Let $f: \cX \to \R$ satisfy the Lipschitz condition
+$|f(x) - f(x')| \leq C |x-x'|$ for some $C > 0$
+and all $x, x' \in \cX$.
+Suppose also that $f$ is a kernel in the sense that
+$\int_\cX f(x) \diff{x} = 1$.
+Then we have
+%
+\begin{align*}
+\sup_{x \in \cX} |f(x)|
+&\leq
+C \Leb(\cX) + \frac{1}{\Leb(\cX)}.
+\end{align*}
+%
+Now let $g: \cX \to [0,\infty)$ satisfy
+$|g(x) - g(x')| \leq C |x-x'|$ for some $C > 0$
+and all $x, x' \in \cX$.
+Suppose $g$ is a sub-kernel with
+$\int_\cX g(x) \diff{x} \leq 1$.
+Then for any $M \in \big(0, \Leb(\cX)\big]$,
+we have
+%
+\begin{align*}
+\sup_{x \in \cX} g(x)
+&\leq
+C M + \frac{1}{M}.
+\end{align*}
+
+\end{lemma}
+
+Applying Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}
+to the density and kernel functions defined in
+Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}
+yields the following.
+Firstly, since $k_h(\cdot, w)$ is $C_\rL / h^2$-Lipschitz
+on $[w \pm h] \cap \cW$ and integrates to one,
+we have by the first inequality in
+Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} that
+%
+\begin{align*}
+|k_h(s,w)|
+&\leq \frac{2 C_\rL + 1}{h} + \frac{1}{\Leb(\cW)}.
+\end{align*}
+%
+Since each of
+$f_{W \mid AA}(\cdot \mid a,a')$,
+$f_{W \mid A}(\cdot \mid a)$, and
+$f_W$ is non-negative, $C_\rH$-Lipschitz on $\cW$,
+and integrates to at most one over $\cW$,
+taking $M = \frac{1}{\sqrt{C_\rH}} \wedge \Leb(\cW)$
+in the second inequality in
+Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}
+gives
+%
+\begin{align*}
+f_{W \mid AA}(w \mid a,a')
+&\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\
+f_{W \mid A}(w \mid a)
+&\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\
+f_W(w)
+&\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}.
+\end{align*}
+
+\begin{proof}[Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}]
+
+We begin with the first inequality.
+Note that if $\Leb(\cX) = \infty$ there is nothing to prove.
+Suppose for contradiction that
+$|f(x)| > C \Leb(\cX) + \frac{1}{\Leb(\cX)}$
+for some $x \in \cX$.
+If $f(x) \geq 0$
+then by the Lipschitz property, for any $y \in \cX$,
+%
+\begin{align*}
+f(y)
+\geq f(x) - C|y-x|
+> C \Leb(\cX) + \frac{1}{\Leb(\cX)} - C\Leb(\cX)
+= \frac{1}{\Leb(\cX)}.
+\end{align*}
+%
+Similarly, if $f(x) \leq 0$ then
+%
+\begin{align*}
+f(y)
+\leq f(x) + C|y-x|
+< - C \Leb(\cX) - \frac{1}{\Leb(\cX)} + C\Leb(\cX)
+= -\frac{1}{\Leb(\cX)}.
+\end{align*}
+%
+But then either
+$\int_\cX f(y) \diff{y} > \int_\cX 1/\Leb(\cX) \diff{y} = 1$
+or
+$\int_\cX f(y) \diff{y} < \int_\cX -1/\Leb(\cX) \diff{y} = -1$,
+and either case contradicts
+$\int_\cX f(y) \diff{y} = 1$.
+
+For the second inequality,
+recall that $g$ is non-negative on $\cX$,
+and take $M \in \big(0, \Leb(\cX)\big]$.
+Suppose for contradiction that
+$g(x) > C M + \frac{1}{M}$
+for some $x \in \cX$.
+Then by the Lipschitz property, $g(y) > 1/M$
+for all $y$ such that $|y - x| \leq M$.
+Since $\cX$ is connected, we have
+$\Leb(\cX \cap [x \pm M]) \geq M$
+and so we deduce that
+$\int_\cX g(y) \diff{y} > M/M = 1$,
+contradicting $\int_\cX g(y) \diff{y} \leq 1$.
+\end{proof}
+
+\begin{proof}[Theorem~\ref{thm:kernel_bias}]
+
+Begin by defining
+%
+\begin{align*}
+P_p(s,w)
+&=
+\sum_{r = 0}^p
+\frac{f_W^{(r)}(w)}{r!}
+{(s-w)^r}
+\end{align*}
+%
+for $s, w \in \cW$
+as the degree-$p$ Taylor polynomial of $f_W$,
+centered at $w$ and evaluated at $s$.
+Note that
+for $p \leq \flbeta-1$,
+by Taylor's theorem with Lagrange remainder,
+%
+\begin{align*}
+f_W(s) - P_p(s,w)
+&=
+\frac{f_W^{(p+1)}(w')}{(p+1)!}
+(s-w)^{p+1}
+\end{align*}
+%
+for some $w'$ between $w$ and $s$.
+Also note that for any $p$,
+%
+\begin{align*}
+\int_{\cW}
+k_h(s,w)
+\big(
+P_p(s,w)
+- P_{p-1}(s,w)
+\big)
+\diff{s}
+&=
+\int_{\cW}
+k_h(s,w)
+\frac{f_W^{(p)}(w)}{p!}
+(s-w)^p
+\diff{s}
+= h^p b_p(w).
+\end{align*}
+%
+Further, by the order of the kernel,
+%
+\begin{align*}
+\E\big[\hat f_W(w)\big]
+- f_W(w)
+&=
+\int_{\cW}
+k_h(s,w)
+f_W(s)
+\diff{s}
+- f_W(w)
+=
+\int_{\cW}
+k_h(s,w)
+\big(f_W(s) - f_W(w)\big)
+\diff{s} \\
+&=
+\int_{\cW}
+k_h(s,w)
+\big(f_W(s) - P_{p-1}(s,w)\big)
+\diff{s}.
+\end{align*}
+
+\proofparagraph{low-order kernel}
+Suppose that $p \leq \flbeta - 1$.
Then
+%
+\begin{align*}
+&\sup_{w \in \cW}
+\big|
+\E[\hat f_W(w)]
+- f_W(w)
+- h^p b_p(w)
+\big| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(f_W(s) - P_{p-1}(s,w)\big)
+\diff{s}
+- h^p b_p(w)
+\right| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(
+f_W(s) - P_{p}(s,w)
++ P_{p}(s,w) - P_{p-1}(s,w)
+\big)
+\diff{s}
+- h^p b_p(w)
+\right| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(
+f_W(s) - P_{p}(s,w)
+\big)
+\diff{s}
+\right|
+= \sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\frac{f_W^{(p+1)}(w')}{(p+1)!}
+(s-w)^{p+1}
+\diff{s}
+\right| \\
+&\quad\leq
+\sup_{w \in \cW}
+\left|
+\int_{[w \pm h]}
+\frac{C_\rk}{h}
+\frac{C_\rH}{(p+1)!}
+h^{p+1}
+\diff{s}
+\right|
+\leq
+\frac{2C_\rk C_\rH}{(p+1)!}
+h^{p+1}.
+\end{align*}
+
+\proofparagraph{order of kernel matches smoothness}
+Suppose that $p = \flbeta$.
+Then
+%
+\begin{align*}
+&\sup_{w \in \cW}
+\big|
+\E[\hat f_W(w)]
+- f_W(w)
+- h^p b_p(w)
+\big| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(f_W(s) - P_{\flbeta - 1}(s,w)\big)
+\diff{s}
+- h^p b_p(w)
+\right| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(
+f_W(s) - P_{\flbeta}(s,w)
++ P_{\flbeta}(s,w) - P_{\flbeta - 1}(s,w)
+\big)
+\diff{s}
+- h^{\flbeta} b_{\flbeta}(w)
+\right| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\big(
+f_W(s) - P_{\flbeta}(s,w)
+\big)
+\diff{s}
+\right| \\
+&\quad=
+\sup_{w \in \cW}
+\left|
+\int_{\cW}
+k_h(s,w)
+\frac{f_W^{(\flbeta)}(w') - f_W^{(\flbeta)}(w)}{\flbeta!}
+(s-w)^{\flbeta}
+\diff{s}
+\right| \\
+&\quad\leq
+\sup_{w \in \cW}
+\left|
+\int_{[w \pm h]}
+\frac{C_\rk}{h}
+\frac{C_\rH h^{\beta - \flbeta}}{\flbeta !}
+h^{\flbeta}
+\diff{s}
+\right|
+\leq
+\frac{2 C_\rk C_\rH}{\flbeta !}
+h^\beta.
+\end{align*}
+
+\proofparagraph{high-order kernel}
+Suppose that $p \geq \flbeta+1$.
+Then as in the previous part
+%
+\begin{align*}
+\sup_{w \in \cW}
+\big|
+\E[\hat f_W(w)]
+- f_W(w)
+\big|
+&=
+\sup_{w \in \cW}
+\left|
+\int_{[w \pm h] \cap \cW}
+\!\!\!\! k_h(s,w)
+\big(
+f_W(s) - P_{\flbeta}(s,w)
+\big)
+\diff{s}
+\right|
+\leq
+\frac{2 C_\rk C_\rH}{\flbeta !}
+h^\beta.
+\end{align*}
+\end{proof}
+
+\begin{proof}[Lemma~\ref{lem:kernel_hoeffding}]
+
+\proofparagraph{Hoeffding-type decomposition}
+
+Observe that
+%
+\begin{align*}
+\hat f_W(w)
+- E_n(w)
+- \E[\hat f_W(w)]
+&=
+\frac{2}{n(n-1)}
+\sum_{i=1}^{n-1}
+\sum_{j=i+1}^{n}
+\Big(
+\E[k_h(W_{i j},w) \mid A_i, A_j]
+- \E[k_h(W_{i j},w)]
+\Big) \\
+&=
+\frac{1}{n(n-1)}
+\sum_{i=1}^{n}
+\sum_{j \neq i}
+\Big(
+\E[k_h(W_{i j},w) \mid A_i, A_j]
+- \E[k_h(W_{i j},w)]
+\Big),
+\end{align*}
+%
+and apply Lemma~\ref{lem:kernel_app_general_hoeffding} with
+%
+\begin{align*}
+u_{i j}
+&=
+\E\big[k_h(W_{i j},w) \mid A_i, A_j\big],
+&u_i
+&=
+\E\big[k_h(W_{i j},w) \mid A_i\big], \\
+u
+&=
+\E\big[k_h(W_{i j},w)\big],
+\end{align*}
+%
+and divide by $n(n-1)$ to see
+%
+\begin{align*}
+\hat f_W(w)
+- E_n(w)
+- \E[\hat f_W(w)]
+&=
+\frac{2}{n}
+\sum_{i=1}^n
+\big(u_i - u\big)
++ \frac{1}{n(n-1)}
+\sum_{i=1}^n
+\sum_{j \neq i}
+\big(
+u_{i j} - u_i - u_j + u
+\big) \\
+&=
+\frac{2}{n}
+\sum_{i=1}^n
+l_i(w)
++ \frac{2}{n(n-1)}
+\sum_{i=1}^n
+\sum_{j = i+1}^n
+q_{i j}(w)
+=
+L_n + Q_n.
+\end{align*}
+
+\proofparagraph{expectation and covariance of $L_n$, $Q_n$, and $E_n$}
+
+$L_n$, $Q_n$, and $E_n$
+are clearly mean-zero.
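+Indeed, with $u_{i j}$, $u_i$, and $u$ as in the previous part,
+the tower property and the independence of $A_i$ and $A_j$ give
+%
+\begin{align*}
+\E[u_i]
+&=
+\E\big[\E[k_h(W_{i j},w) \mid A_i]\big]
+=
+\E[k_h(W_{i j},w)]
+= u, \\
+\E[u_{i j} \mid A_i]
+&=
+\E[k_h(W_{i j},w) \mid A_i]
+= u_i, \\
+\E[u_j \mid A_i]
+&=
+\E[u_j]
+= u,
+\end{align*}
+%
+so that
+$\E[u_i - u] = 0$
+and
+$\E\big[u_{i j} - u_i - u_j + u \mid A_i\big] = 0$,
+which are the mean-zero and conditional mean-zero properties
+underlying $L_n$, $Q_n$,
+and the orthogonality calculations below.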
+For orthogonality, +note that their summands +have the following properties, +for any $1 \leq i < j \leq n$ +and $1 \leq r < s \leq n$, +and for any $w, w' \in \cW$: +% +\begin{align*} +\E\big[ +l_i(w) +q_{rs}(w') +\big] +&= +\E\big[ +l_i(w) +\E\big[ +q_{rs}(w') \mid A_i +\big] +\big] += 0, \\ +\E\big[ +l_i(w) +e_{rs}(w') +\big] +&= +\begin{cases} +\E\big[ +l_i(w) +\big] +\E\big[ +e_{rs}(w') +\big], +\text{ if } i \notin \{r,s\}, \\ +\E\big[ +l_i(w) +\E\big[ +e_{rs}(w') \mid A_r, A_s +\big] +\big], +\text{ if } i \in \{r,s\}, +\end{cases} \\ +&= +0, \\ +\E\big[ +q_{i j}(w) +e_{rs}(w') +\big] +&= +\begin{cases} +\E\big[ +q_{i j}(w) +\big] +\E\big[ +e_{rs}(w') +\big], +\text{ if } \{i,j\} \cap \{r,s\} = \emptyset, \\ +\E\big[ +\E\big[ +q_{i j}(w) \mid A_i +\big] +\E\big[ +e_{rs}(w') \mid A_i +\big] +\big], +\text{ if } \{i,j\} \cap \{r,s\} = \{i\}, \\ +\E\big[ +\E\big[ +q_{i j}(w) \mid A_j +\big] +\E\big[ +e_{rs}(w') \mid A_j +\big] +\big], +\text{ if } \{i,j\} \cap \{r,s\} = \{j\}, \\ +\E\big[ +q_{i j}(w) +\E\big[ +e_{rs}(w') \mid A_r, A_s +\big] +\big], +\text{ if } \{i,j\} = \{r,s\}, +\end{cases} \\ +&= +0, +\end{align*} +% +by independence of $\bA_n$ and $\bV_n$ +and as $\E[q_{rs}(w) \mid A_i] = 0$ +and $\E[e_{i j}(w) \mid A_i, A_j] = 0$. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_trichotomy}] + +\proofparagraph{total degeneracy} + +Suppose +$\Dl = 0$, so +$\Var[f_{W \mid A}(w \mid A_i)] = 0$ +for all $w \in \cW$. +Therefore, for all $w \in \cW$, +we have $f_{W \mid A}(w) = f_W(w)$ almost surely. +By taking a union over $\cW \cap \Q$ +and by continuity of $f_{W \mid A}$ and $f_W$, +this implies that $f_{W \mid A}(w) = f_W(w)$ +for all $w \in \cW$ +almost surely. Thus +% +\begin{align*} +\E\left[ +k_h(W_{i j},w) \mid A_i +\right] +&= +\int_{\cW} +k_h(s,w) +f_{W \mid A}(s \mid A_i) +\diff{s} += +\int_{\cW} +k_h(s,w) +f_W(s) +\diff{s} += +\E\left[ +k_h(W_{i j},w) +\right] +\end{align*} +% +for all $w \in \cW$ almost surely. +Hence $l_i(w) = 0$ and so $L_n(w) = 0$ +for all $w \in \cW$ almost surely. + +\proofparagraph{no degeneracy} + +Suppose $\Dl > 0$. +As $f_{W|A}(\cdot \mid a)$ is $C_\rH$-Lipschitz +for all $a \in \cA$ and since $|k_h| \leq C_\rk/h$, +% +\begin{align*} +&\sup_{w \in \cW} +\left| +\E[k_h(W_{i j},w) \mid A_i] +- f_{W \mid A}(w \mid A_i) +\right| \\ +&\quad= +\sup_{w \in \cW} +\left| +\int_{\cW} +k_h(s,w) +f_{W \mid A}(s \mid A_i) +\diff{s} +- f_{W \mid A}(w \mid A_i) +\right| \\ +&\quad= +\sup_{w \in \cW} +\left| +\int_{\cW \cap [w \pm h]} +k_h(s,w) +\left( +f_{W \mid A}(s \mid A_i) +- f_{W \mid A}(w \mid A_i) +\right) +\diff{s} +\right| \\ +&\quad\leq +2h +\frac{C_\rk}{h} +C_\rH h +\leq +2 C_\rk C_\rH h +\end{align*} +% +almost surely. +Therefore, since $f_{W \mid A}(w \mid a) \leq C_\rd$, +we have +% +\begin{align*} +\sup_{w \in \cW} +\left| +\Var\big[ +\E[k_h(W_{i j},w) \mid A_i] +\big] +- \Var\left[ +f_{W \mid A}(w \mid A_i) +\right] +\right| +&\leq +16 C_\rk C_\rH C_\rd h +\end{align*} +% +whenever $h$ is small enough that +$2 C_\rk C_\rH h \leq C_\rd$. Thus +% +\begin{align*} +\inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] +&\geq +\inf_{w \in \cW}\Var[f_{W \mid A}(w \mid A_i)] +- 16 C_\rk C_\rH C_\rd h. +\end{align*} +% +Therefore, if $\Dl > 0$, then eventually +$\inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \Dl/2$. +Finally, +% +\begin{align*} +\inf_{w \in \cW}\Var[L_n(w)] +&= +\frac{4}{n} +\inf_{w \in \cW} +\Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] +\geq +\frac{2 \Dl}{n}. 
+\end{align*} + +\proofparagraph{partial degeneracy} + +Since $f_{W \mid A}(w \mid A_i)$ +is bounded by $C_\rd$ and $C_\rH$-Lipschitz in $w$, +we have that +$\Var[f_{W \mid A}(w \mid A_i)]$ +is continuous on $\cW$. +Thus if $\Dl = 0$, +there is at least one point $w \in \cW$ +for which +$\Var[f_{W \mid A}(w \mid A_i)] = 0$ +by compactness. +Let $w$ be any such degenerate point. +Then by the previous part, +% +\begin{align*} +\Var[L_n(w)] = +\frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] +&\leq +64 C_\rk C_\rH C_\rd \frac{h}{n}. +\end{align*} +% +If conversely $w$ is not a degenerate point +then +$\Var[f_{W \mid A}(w \mid A_i)] > 0$ +so eventually +% +\begin{align*} +\Var[L_n(w)] += \frac{4}{n} +\Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] +&\geq +\frac{2}{n} +\Var[f_{W \mid A}(w \mid A_i)]. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_uniform_concentration}] + +We establish VC-type properties of function +classes and apply empirical process theory. + +\proofparagraph{establishing VC-type classes} + +Consider the following function classes: +% +\begin{align*} +\cF_1 +&= +\Big\{ +W_{i j} \mapsto +k_h(W_{i j},w) +: w \in \cW +\Big\}, \\ +\cF_2 +&= +\Big\{ +(A_i, A_j) \mapsto +\E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] +: w \in \cW +\Big\}, \\ +\cF_3 +&= +\Big\{ +A_i \mapsto +\E\big[ k_h(W_{i j},w) \mid A_i \big] +: w \in \cW +\Big\}. +\end{align*} +% +For $\cF_1$, take $0 < \varepsilon \leq \Leb(\cW)$ +and $\cW_\varepsilon$ an $\varepsilon$-cover of $\cW$ +of cardinality at most $\Leb(\cW)/\varepsilon$. As +% +\begin{align*} +\sup_{s, w, w' \in \cW} +\left| +\frac{k_h(s,w) - k_h(s,w')} +{w-w'} +\right| +&\leq +\frac{C_\mathrm{L}}{h^2} +\end{align*} +% +almost surely, +we see that +% +\begin{align*} +\sup_\Q +N\left(\cF_1, \rho_\Q, +\frac{C_\mathrm{L}}{h^2} \varepsilon \right) +&\leq +N\left(\cF_1, \|\cdot\|_\infty, +\frac{C_\mathrm{L}}{h^2} \varepsilon \right) +\leq +\frac{\Leb(\cW)}{\varepsilon}, +\end{align*} +% +where $\Q$ ranges over Borel +probability measures on $\cW$. +Since +$\frac{C_\rk}{h}$ +is an envelope for $\cF_1$, +% +\begin{align*} +\sup_\Q +N\left(\cF_1, \rho_\Q, +\frac{C_\rk}{h} \varepsilon \right) +&\leq +\frac{C_\rL}{C_\rk} +\frac{\Leb(\cW)}{h \varepsilon}. +\end{align*} +% +Thus for all $\varepsilon \in (0,1]$, +% +\begin{align*} +\sup_\Q +N\left(\cF_1, \rho_\Q, +\frac{C_\rk}{h} \varepsilon \right) +&\leq +\frac{C_\rL}{C_\rk} +\frac{\Leb(\cW) \vee 1}{h \varepsilon} +\leq +(C_1/(h\varepsilon))^{C_2}, +\end{align*} +% +where +$C_1 = \frac{C_\rL}{C_\rk} (\Leb(\cW) \vee 1)$ +and $C_2 = 1$. 
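+The envelope-scaled bound above follows from the Lipschitz bound
+by a change of covering radius;
+spelling this step out,
+%
+\begin{align*}
+\sup_\Q
+N\left(\cF_1, \rho_\Q,
+\frac{C_\rk}{h} \varepsilon \right)
+=
+\sup_\Q
+N\left(\cF_1, \rho_\Q,
+\frac{C_\rL}{h^2}
+\frac{C_\rk h}{C_\rL} \varepsilon \right)
+\leq
+\frac{\Leb(\cW)}{(C_\rk h / C_\rL) \varepsilon}
+=
+\frac{C_\rL}{C_\rk}
+\frac{\Leb(\cW)}{h \varepsilon}.
+\end{align*}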
+Next, $\cF_2$ forms a smoothly parameterized class of functions +since for $w,w' \in \cW$ we have +by the uniform Lipschitz properties of +$f_{W \mid AA}(\cdot \mid A_i, A_j)$ and +$k_h(s, \cdot)$, +with $|w-w'| \leq h$, +% +\begin{align*} +&\left| +\E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] +- \E\big[ k_h(W_{i j},w') \mid A_i, A_j \big] +\right| \\ +&\quad= +\left| +\int_{[w \pm h] \cap \cW} +k_h(s,w) +f_{W \mid AA}(s \mid A_i, A_j) +\diff{s} +- \int_{[w' \pm h] \cap \cW} +k_h(s,w') +f_{W \mid AA}(s \mid A_i, A_j) +\diff{s} +\right| \\ +&\quad= +\left| +\int_{[w \pm 2h] \cap \cW} +\big( +k_h(s,w) +- k_h(s,w') +\big) +f_{W \mid AA}(s \mid A_i, A_j) +\diff{s} +\right| \\ +&\quad= +\left| +\int_{[w \pm 2h] \cap \cW} +\big( +k_h(s,w) +- k_h(s,w') +\big) +\big( +f_{W \mid AA}(s \mid A_i, A_j) +- f_{W \mid AA}(w \mid A_i, A_j) +\big) +\diff{s} +\right| \\ +&\quad\leq +4h +\frac{C_\rL}{h^2} +|w-w'| +2 C_\rH h +\leq +8 C_\rL C_\rH +|w-w'| +\leq +C_3 +|w-w'|, +\end{align*} +% +where $C_3 = 8 C_\rL C_\rH$. +The same holds for $|w-w'| > h$ +as the Lipschitz property is local. +By taking $\E[\, \cdot \mid A_i]$, +it can be seen +by the contraction property of conditional expectation that +the same holds for the +singly-conditioned terms: +% +\begin{align*} +\left| +\E\big[ k_h(W_{i j},w) \mid A_i \big] +- \E\big[ k_h(W_{i j},w') \mid A_i \big] +\right| +&\leq +C_3 +|w-w'|. +\end{align*} +% +Therefore $\cF_3$ is also smoothly parameterized +in exactly the same manner. +Let +% +\begin{align*} +C_4 +&= +\sup_{w \in \cW} +\esssup_{A_i, A_j} +\big| +\E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] +\big| \\ +&= +\sup_{w \in \cW} +\esssup_{A_i, A_j} +\left| +\int_{[w \pm h] \cap \cW} +k_h(s,w) +f_{W \mid AA}(s \mid A_i, A_j) +\diff{s} +\right| \\ +&\leq 2h \frac{C_\rk}{h} C_\rd +\leq 2 C_\rk C_\rd. +\end{align*} +% +For $\varepsilon \in (0,1]$, +take an $(\varepsilon C_4/C_3)$-cover of $\cW$ +of cardinality at most $C_3 \Leb(\cW) / (\varepsilon C_4)$. +By the above parameterization properties, +this cover induces an +$\varepsilon C_4$-cover for both $\cF_2$ and $\cF_3$: +% +\begin{align*} +\sup_\Q +N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) +&\leq +N\big(\cF_2, \|\cdot\|_\infty, \varepsilon C_4 \big) +\leq +C_3 \Leb(\cW) / (\varepsilon C_4), \\ +\sup_\Q +N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) +&\leq +N\big(\cF_3, \|\cdot\|_\infty, \varepsilon C_4 \big) +\leq +C_3 \Leb(\cW) / (\varepsilon C_4). +\end{align*} +% +Hence $\cF_1$, $\cF_2$, and $\cF_3$ +form VC-type classes with envelopes +$F_1 = C_\rk / h$ and $F_2 = F_3 = C_4$: +% +\begin{align*} +\sup_\Q +N\left(\cF_1, \rho_\Q, +\varepsilon C_\rk / h \right) +&\leq +(C_1/(h\varepsilon))^{C_2}, +&\sup_\Q +N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) +&\leq +(C_1/\varepsilon)^{C_2}, \\ +\sup_\Q +N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) +&\leq +(C_1/\varepsilon)^{C_2}, +\end{align*} +% +for some constants $C_1 \geq e$ and $C_2 \geq 1$, +where we augment the constants if necessary. + +\proofparagraph{controlling $L_n$} + +Observe that +$\sqrt{n}L_n$ +is the empirical process of the i.i.d.\ variables $A_i$ +indexed by $\cF_3$. +We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} +with $\sigma = C_4$: +% +\begin{align*} +\E \left[ +\sup_{w \in \cW} +\big| \sqrt{n} L_ +n(w) \big| +\right] +&\lesssim +C_4 +\sqrt{C_2 \log C_1} ++ +\frac{C_4 C_2 \log C_1} +{\sqrt{n}} +\lesssim 1. 
+\end{align*} +% +By Lemma~\ref{lem:kernel_trichotomy}, +the left hand side is zero whenever +$\Du = 0$, +so we can also write +% +\begin{align*} +\E \left[ +\sup_{w \in \cW} +\big| \sqrt{n} L_n(w) \big| +\right] +&\lesssim +\Du. +\end{align*} + +\proofparagraph{controlling $Q_n$} + +Observe that $n Q_n$ +is the completely degenerate second-order U-process +of the i.i.d.\ variables $A_i$ +indexed by $\cF_2$. +This function class is again uniformly bounded and VC-type, +so applying the U-process maximal inequality from +Lemma~\ref{lem:kernel_app_uprocess_maximal} +yields with $\sigma = C_4$ +% +\begin{align*} +\E \left[ +\sup_{w \in \cW} +\big| n Q_n(w) \big| +\right] +&\lesssim +C_4 +C_2 \log C_1 ++ +\frac{C_4 (C_2 \log C_1)^2} +{\sqrt{n}} +\lesssim 1. +\end{align*} + +\proofparagraph{controlling $E_n$} + +Conditional on $\bA_n$, +note that $n E_n$ +is the empirical process of the conditionally +i.n.i.d.\ variables $W_{i j}$ +indexed by $\cF_1$. +We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} +conditionally with +% +\begin{align*} +\sigma^2 +&= +\sup_{w \in \cW} +\E\Big[ +\big( +k_h(W_{i j},w) +- \E[k_h(W_{i j},w) \mid A_i, A_j] +\big)^2 +\mid A_i, A_j +\Big] +\leq +\sup_{w \in \cW} +\E\Big[ +k_h(W_{i j},w)^2 +\mid A_i, A_j +\Big] \\ +&\leq +\sup_{w \in \cW} +\int_{[w \pm h] \cap \cW} +k_h(s,w)^2 +f_{W \mid AA}(s \mid A_i, A_j) +\diff{s} +\leq 2h \frac{C_\rk^2}{h^2} +\lesssim 1/h +\end{align*} +% +and noting that we have +a sample size of +$\frac{1}{2}n(n-1)$, +giving +% +\begin{align*} +\E \left[ +\sup_{w \in \cW} +\big| n E_n(w) \big| +\right] +&\lesssim +\sigma +\sqrt{C_2 \log \big((C_1/h) F_1 / \sigma \big)} ++ +\frac{F_1 C_2 \log \big((C_1/h) F_1 / \sigma\big)} +{n} \\ +&\lesssim +\frac{1}{\sqrt h} +\sqrt{C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} ++ +\frac{(C_\rk/h) C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} +{n} \\ +&\lesssim +\sqrt{\frac{\log 1/h}{h}} ++ +\frac{\log \big(1/h\big)} +{n h} +\lesssim +\sqrt{\frac{\log n}{h}}, +\end{align*} +% +where the last line follows by the bandwidth assumption +of $\frac{\log n}{n^2h} \to 0$. +\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_uniform_consistency}] +This follows from Theorem~\ref{thm:kernel_bias} +and Lemma~\ref{lem:kernel_uniform_concentration}. +\end{proof} + +Before proving Theorem~\ref{thm:kernel_minimax} +we first give a lower bound result +for parametric point estimation in +Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}. + +\begin{lemma}[A Neyman--Pearson result for Bernoulli random variables] +\label{lem:kernel_app_neyman_pearson_bernoulli} + +Recall that the Bernoulli distribution +$\Ber(\theta)$ +places mass $\theta$ at $1$ and mass +$1-\theta$ at $0$. +Define $\P_\theta^n$ as the law of +$(A_1, A_2, \ldots, A_n, V)$, +where $A_1, \ldots, A_n$ +are i.i.d.\ $\Ber(\theta)$, +and $V$ is an $\R^d$-valued random variable +for some $d \geq 1$ +which is independent of the $A$ variables +and with a fixed distribution that does not depend on $\theta$. +Let $\theta_0 = \frac{1}{2}$ +and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$. +Then for any estimator $\tilde \theta_n$ +which is a function of +$(A_1, A_2, \ldots, A_n, V)$ only, +% +\begin{align*} +\P_{\theta_0}^n \left( +\big| \tilde \theta_n - \theta_0 \big| +\geq \frac{1}{\sqrt{32n}} +\right) ++ \P_{\theta_{1,n}}^n \left( +\big| \tilde \theta_n - \theta_{1,n} \big| +\geq \frac{1}{\sqrt{32n}} +\right) +\geq \frac{1}{2}. 
+\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}] + +Let $f: \{0,1\}^n \to \{0,1\}$ +be any function. +Considering this function as a statistical test, +the Neyman--Pearson lemma and Pinsker's inequality +\citep{gine2021mathematical} +give +% +\begin{align*} +\P_{\theta_0}^n \big( +f=1 +\big) ++\P_{\theta_{1,n}}^n \big( +f=0 +\big) +&\geq +1- +\TV\left( +\P_{\theta_0}^n, +\P_{\theta_{1,n}}^n +\right) +\geq +1- +\sqrt{ +\frac{1}{2} +\KL \left( +\P_{\theta_0}^n +\bigm\| +\P_{\theta_{1,n}}^n +\right)} \\ +&= +1- +\sqrt{ +\frac{n}{2} +\KL \left( +\Ber(\theta_0) +\bigm\| +\Ber(\theta_{1,n}) +\right) ++ \frac{n}{2} +\KL \left( +V +\bigm\| +V +\right)} \\ +&= +1- +\sqrt{ +\frac{n}{2} +\KL \left( +\Ber(\theta_0) +\bigm\| +\Ber(\theta_{1,n}) +\right)}, +\end{align*} +% +where $\TV$ is the total variation distance +and $\KL$ is the Kullback--Leibler divergence. +In the penultimate line +we used the tensorization of Kullback--Leibler divergence +\citep{gine2021mathematical}, +noting that the law of $V$ is fixed and hence does not contribute. +We now evaluate this Kullback--Leibler divergence at the specified +parameter values. +% +\begin{align*} +\P_{\theta_0}^n \big( +f=1 +\big) ++\P_{\theta_{1,n}}^n \big( +f=0 +\big) +&\geq +1- +\sqrt{ +\frac{n}{2} +\KL \left( +\Ber(\theta_0) +\bigm\| +\Ber(\theta_{1,n}) +\right)} \\ +&= +1- +\sqrt{\frac{n}{2}} +\sqrt{ +\theta_0 \log \frac{\theta_0}{\theta_{1,n}} ++ (1 - \theta_0) \log \frac{1 - \theta_0}{1 - \theta_{1,n}}} \\ +&= +1- +\sqrt{\frac{n}{2}} +\sqrt{ +\frac{1}{2} \log \frac{1/2}{1/2 + 1/\sqrt{8n}} ++ \frac{1}{2} \log \frac{1/2}{1/2 - 1/\sqrt{8n}}} \\ +&= +1- +\frac{\sqrt n}{2} +\sqrt{\log \frac{1}{1 - 1/(2n)}} +\geq +1- +\frac{\sqrt n}{2} +\sqrt{\frac{1}{n}} += +\frac{1}{2}, +\end{align*} +% +where in the penultimate line we used that +$\log \frac{1}{1-x} \leq 2x$ +for $x \in [0,1/2]$. +Now define a test $f$ by +$f = 1$ if $\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}}$ +and $f=0$ otherwise, +to see +% +\begin{align*} +\P_{\theta_0}^n \left( +\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} +\right) ++ \P_{\theta_{1,n}}^n \left( +\tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} +\right) +\geq \frac{1}{2}. +\end{align*} +% +By the triangle inequality, +recalling that +$\theta_0 = \frac{1}{2}$ +and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, +we have +% +\begin{align*} +\left\{ +\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} +\right\} +&\subseteq +\left\{ +\left| \tilde \theta_n - \theta_0 \right| +\geq \frac{1}{\sqrt{32n}} +\right\} \\ +\left\{ +\tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} +\right\} +&\subseteq +\left\{ +\left| \tilde \theta_n - \theta_{1,n} \right| +\geq \frac{1}{\sqrt{32n}} +\right\}. +\end{align*} +% +Thus by the monotonicity of measures, +% +\begin{align*} +\P_{\theta_0}^n \left( +\big| \tilde \theta_n - \theta_0 \big| +\geq \frac{1}{\sqrt{32n}} +\right) ++ \P_{\theta_{1,n}}^n \left( +\big| \tilde \theta_n - \theta_{1,n} \big| +\geq \frac{1}{\sqrt{32n}} +\right) +\geq \frac{1}{2}. +\end{align*} +\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_minimax}] + +\proofparagraph{lower bound for $\cP$} + +By translation and scaling of the data, +we may assume without loss of generality that $\cW = [-1,1]$. +We may also assume that $C_\rH \leq 1/2$, +since reducing $C_\rH$ can only shrink the class of distributions. 
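+This restriction also guarantees that the linear perturbations
+used in the construction below are genuine densities on $[-1,1]$;
+as a short side check,
+%
+\begin{align*}
+\frac{1}{2} + C_\rH v
+\geq
+\frac{1}{2} - C_\rH
+\geq 0
+\quad \text{for all } v \in [-1,1],
+\qquad
+\int_{-1}^{1}
+\left(
+\frac{1}{2} + C_\rH v
+\right)
+\diff{v}
+= 1.
+\end{align*}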
+Define the dyadic distribution $\P_\theta$ +with parameter $\theta \in [1/2, 1]$ +as follows: +$A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, while +$V_{i j}$ for $1 \leq i < j \leq n$ are i.i.d.\ +and independent of $\bA_n$. +The distribution of $V_{i j}$ is given by its density function +$f_V(v) = \frac{1}{2} + C_\rH v$ on $[-1,1]$. +Finally, generate +$W_{i j} = W(A_i, A_j, V_{i j}) \vcentcolon= +(2 A_i A_j - 1) V_{i j}$. +Note that the function $W$ does not depend on $\theta$. +The conditional and marginal densities of $W_{i j}$ are +for $w \in [-1,1]$ +% +\begin{align*} +f_{W \mid AA}(w \mid A_i, A_j) +&= +\begin{cases} +\frac{1}{2} + C_\rH w & \text{if } A_i = A_j = 1, \\ +\frac{1}{2} - C_\rH w & \text{if } A_i = 0 \text{ or } A_j = 0, \\ +\end{cases} \\ +f_{W \mid A}(w \mid A_i) +&= +\begin{cases} +\frac{1}{2} + (2 \theta - 1) C_\rH w +& \text{if } A_i = 1, \\ +\frac{1}{2} - C_\rH w & \text{if } A_i = 0 , \\ +\end{cases} \\ +f_W(w)&= \frac{1}{2} + (2\theta^2 - 1) C_\rH w. +\end{align*} +% +Clearly, +$f_W \in \cH^\beta_{C_\rH}(\cW)$ and +$f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$. +Also +$\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV \leq 1$. +Therefore +$\P_\theta$ satisfies Assumption~\ref{ass:kernel_data} +and so +$\big\{\P_\theta : \theta \in [1/2, 1] \big\} \subseteq \cP$. + +Note that $f_W(1) = \frac{1}{2} + (2\theta^2 - 1) C_\rH $, +so $\theta^2 = \frac{1}{2 C_\rH}(f_W(1) - 1/2 + C_\rH)$. +Thus if $\tilde f_W$ is some density estimator +depending only on the data $\bW_n$, +we define the parameter estimator +% +\begin{align*} +\tilde \theta_n^2 +&\vcentcolon= +\frac{1}{2 C_\rH}\left( +\tilde f_W(1) - \frac{1}{2} + C_\rH +\right) +\vee 0. +\end{align*} +% +This gives the inequality +% +\begin{align*} +\big| +\tilde \theta_n^2 - \theta^2 +\big| +&= +\left| +\frac{1}{2 C_\rH}\left( +\tilde f_W(1) - \frac{1}{2} + C_\rH +\right) +\vee 0 +- +\frac{1}{2 C_\rH}\left( +f_W(1) - \frac{1}{2} + C_\rH +\right) +\right| \\ +&\leq +\frac{1}{2 C_\rH} +\sup_{w \in \cW} +\left| +\tilde f_W(w) - f_W(w) +\right|. +\end{align*} +% +Therefore, since also $\tilde \theta \geq 0$ +and $\theta \geq \frac{1}{2}$, +% +\begin{align*} +\big| +\tilde \theta_n - \theta +\big| +&= +\frac{\big|\tilde \theta_n^2 - \theta^2\big|} +{\tilde \theta_n + \theta} +\leq +\frac{1}{C_\rH} +\sup_{w \in \cW} +\left| +\tilde f_W(w) - f_W(w) +\right|. +\end{align*} +% +Now we apply the point estimation lower bound from +Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}, +setting $\theta_0 = \frac{1}{2}$ +and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, +noting that the estimator +$\tilde \theta_n$ +is a function of $\bW_n$ only, +thus is a function of $\bA_n$ and +$\bV_n$ only and so satisfies the conditions. +% +\begin{align*} +&\P_{\theta_0} \left( +\sup_{w \in \cW} \big| \tilde f_W(w) - f^{(0)}_W(w) \big| +\geq \frac{1}{C\sqrt{n}} +\right) ++ \P_{\theta_{1,n}} \left( +\sup_{w \in \cW} \big| \tilde f_W(w) - f^{(1)}_W(w) \big| +\geq \frac{1}{C\sqrt{n}} +\right) \\ +&\quad\geq +\P_{\theta_0} \left( +\big| \tilde \theta_n - \theta_0 \big| +\geq \frac{1}{C C_\rH \sqrt{n}} +\right) ++ \P_{\theta_{1,n}} \left( +\big| \tilde \theta_n - \theta_{1,n} \big| +\geq \frac{1}{C C_\rH \sqrt{n}} +\right) \\ +&\quad\geq +\P_{\theta_0} \left( +\big| \tilde \theta_n - \theta_0 \big| +\geq \frac{1}{\sqrt{32n}} +\right) ++ \P_{\theta_{1,n}} \left( +\big| \tilde \theta_n - \theta_{1,n} \big| +\geq \frac{1}{\sqrt{32n}} +\right) +\geq +\frac{1}{2}, +\end{align*} +% +where we set $C \geq \frac{\sqrt{32}}{C_\rH}$. 
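+The deduction in the next display uses only the elementary fact that
+two probabilities summing to at least one half
+cannot both be smaller than one quarter:
+writing, just for this remark,
+$a$ and $b$ for the two probabilities above,
+%
+\begin{align*}
+\max(a, b)
+\geq
+\frac{a + b}{2}
+\geq
+\frac{1}{4},
+\end{align*}
+%
+and both $\P_{\theta_0}$ and $\P_{\theta_{1,n}}$ belong to $\cP$.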
+Therefore we deduce that +% +\begin{align*} +\inf_{\tilde f_W} +\sup_{\P \in \cP} +\P\left( +\sup_{w \in \cW} +\big| +\tilde f_W(w) - f_W(w) +\big| +\geq +\frac{1}{C \sqrt n} +\right) +\geq \frac{1}{4} +\end{align*} +% +and so +% +\begin{align*} +\inf_{\tilde f_W} +\sup_{\P \in \cP} +\E_\P\left[ +\sup_{w \in \cW} +\big| +\tilde f_W(w) - f_W(w) +\big| +\right] +\geq \frac{1}{4 C \sqrt{n}}. +\end{align*} + +\proofparagraph{lower bound for $\cP_\rd$} + +For the subclass of totally degenerate distributions, +we rely on the main theorem +from \citet{khasminskii1978lower}. +Let $\cP_0$ be the subclass of $\cP_\rd$ +consisting of the distributions which satisfy +$A_1 = \cdots = A_n = 0$ +and $W_{i j} \vcentcolon= A_i + A_j + V_{i j} = V_{i j}$, +so that $W_{i j}$ are i.i.d.\ with common density $f_W = f_V$. +Define the class +% +\begin{align*} +\cF +&= +\left\{ +f \text{ density function on } \R, \ +f \in \cH^\beta_{C_\rH}(\cW) +\right\}. +\end{align*} +% +Write $\E_f$ for the expectation under $W_{i j}$ having density $f$. +Then by \citet{khasminskii1978lower}, +% +\begin{align*} +\liminf_{n \to \infty} +\inf_{\tilde f_W} +\sup_{f \in \cF} +\E_f\left[ +\left( \frac{n^2}{\log n} \right)^{\frac{\beta}{2\beta + 1}} +\sup_{w \in \cW} +\big| \tilde f_W(w) - f_W(w) \big| +\right] +> 0, +\end{align*} +% +where $\tilde f_W$ is any +density estimator +depending only on the $\frac{1}{2}n(n-1)$ i.i.d.\ data samples $\bW_n$. +Now every density function in +$\cH^\beta_{C_\rH}(\cW)$ +corresponds to a distribution in +$\cP_0$ and therefore to a distribution in $\cP_\rd$. +Thus for large enough $n$ and +some positive constant $C$, +% +\begin{align*} +\inf_{\tilde f_W} +\sup_{\P \in \cP_\rd} +\E_\P\left[ +\sup_{w \in \cW} +\big| \tilde f_W(w) - f_W(w) \big| +\right] +\geq +\frac{1}{C} +\left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta + 1}}. +\end{align*} + +\proofparagraph{upper bounds} + +The upper bounds follow by +using a dyadic kernel density estimator $\hat f_W$ +with a boundary bias-corrected +Lipschitz kernel of order $p \geq \beta$ and a bandwidth of $h$. +Theorem~\ref{thm:kernel_bias} gives +% +\begin{align*} +\sup_{\P \in \cP} +\sup_{w \in \cW} +\big| +\E_\P\big[\hat f_W(w)\big] +- f_W(w) +\big| +\leq +\frac{4C_\rk C_\rH}{\flbeta !} +h^\beta. +\end{align*} +% +Then, +treating the degenerate and non-degenerate cases separately +and noting that all inequalities hold uniformly over +$\cP$ and $\cP_\rd$, +the proof of Lemma~\ref{lem:kernel_uniform_concentration} +shows that +% +\begin{align*} +\sup_{\P \in \cP} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| +\right] +&\lesssim +\frac{1}{\sqrt n} ++ \sqrt{\frac{\log n}{n^2h}}, \\ +\sup_{\P \in \cP_\rd} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| +\right] +&\lesssim +\sqrt{\frac{\log n}{n^2h}}. +\end{align*} +% +Thus combining these yields that +% +\begin{align*} +\sup_{\P \in \cP} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - f_W(w)\big| +\right] +&\lesssim +h^\beta ++ \frac{1}{\sqrt n} ++ \sqrt{\frac{\log n}{n^2h}}, \\ +\sup_{\P \in \cP_\rd} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - f_W(w)\big| +\right] +&\lesssim +h^\beta ++ \sqrt{\frac{\log n}{n^2h}}. +\end{align*} +% +Set $h = \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ +and note that $\beta \geq 1$ implies that +$\left(\frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} +\ll \frac{1}{\sqrt n}$. 
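+This choice of $h$ exactly balances the two rate terms;
+spelling out the exponents,
+%
+\begin{align*}
+\sqrt{\frac{\log n}{n^2 h}}
+=
+\left( \frac{\log n}{n^2} \right)^{\frac{1}{2}}
+\left( \frac{\log n}{n^2} \right)^{-\frac{1}{2(2\beta+1)}}
+=
+\left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}}
+= h^\beta.
+\end{align*}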
+So for $C > 0$, +% +\begin{align*} +\sup_{\P \in \cP} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - f_W(w)\big| +\right] +&\lesssim +\frac{1}{\sqrt n} ++ \left( +\frac{\log n}{n^2} +\right)^{\frac{\beta}{2\beta+1}} +\leq +\frac{C}{\sqrt n}, \\ +\sup_{\P \in \cP_\rd} +\E_\P\left[ +\sup_{w \in \cW} +\big|\hat f_W(w) - f_W(w)\big| +\right] +&\leq +C\left( +\frac{\log n}{n^2} +\right)^{\frac{\beta}{2\beta+1}}. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_structure}] + +We write $k_{i j}$ for $k_h(W_{i j},w)$ +and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. +% +\begin{align*} +\Sigma_n(w,w') +&= +\E\Big[ +\big( +\hat f_W(w) +- \E[\hat f_W(w)] +\big) +\big( +\hat f_W(w') +- \E[\hat f_W(w')] +\big) +\Big] \\ +&= +\E\left[ +\left( +\frac{2}{n(n-1)} +\sum_{i \Du \frac{t + C_1 \log n}{\sqrt n} +\right) +\leq C_2 e^{-C_3 t}. +\end{align*} +% +Integrating tail probabilities shows that +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\Big|\sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\Big| +\right] +&\leq +\Du \frac{C_1 \log n}{\sqrt n} ++ \int_0^\infty +\frac{\Du}{\sqrt n} +C_2 e^{-C_3 t} +\diff{t} +\lesssim +\frac{\Du \log n}{\sqrt n}. +\end{align*} +% +Further, +$Z_n^{L\prime}$ has the +same covariance structure as $G_n^{L\prime}$ in the +sense that for all $w, w' \in \cW$, +% +\begin{align*} +\E\big[Z_n^{L\prime}(w) Z_n^{L\prime}(w')\big] += \E\big[G_n^{L\prime}(w) G_n^{L\prime}(w')\big], +\end{align*} +% +and clearly $L_n'$ +is equal in distribution to $L_n$. +To obtain the trajectory regularity property of +$Z_n^{L\prime}$, +note that it was shown in the proof of +Lemma~\ref{lem:kernel_uniform_concentration} +that for all $w,w' \in \cW$, +% +\begin{align*} +\left| +k_h^A(A_i,w) +- k_h^A(A_i,w') +\right| +&\leq +C +|w-w'| +\end{align*} +% +for some constant $C > 0$. +Therefore, since the $A_i$ are i.i.d., +% +\begin{align*} +&\E\left[ +\big| +Z_n^{L\prime}(w) +- Z_n^{L\prime}(w') +\big|^2 +\right]^{1/2} += +\sqrt{n} +\E\left[ +\big| +L_n(w) +- L_n(w') +\big|^2 +\right]^{1/2} \\ +&\quad= +\sqrt{n} +\E\left[ +\left| +\frac{1}{n} +\sum_{i=1}^n +\Big( +k_h^A(A_i,w) +- k_h^A(A_i,w') +- \E\big[k_h^A(A_i,w)] ++ \E\big[k_h^A(A_i,w')] +\Big) +\right|^2 +\right]^{1/2} \\ +&\quad= +\E\left[ +\Big| +k_h^A(A_i,w) +- k_h^A(A_i,w') +- \E\big[k_h^A(A_i,w)] ++ \E\big[k_h^A(A_i,w')] +\Big|^2 +\right]^{1/2} +\lesssim +|w-w'|. +\end{align*} +% +Therefore, by +the regularity result for Gaussian processes in +Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +with $\delta_n \in (0, 1/2]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| +Z_n^{L\prime}(w) +- Z_n^{L\prime}(w') +\big| +\right] +&\lesssim +\int_0^{\delta_n} +\sqrt{\log 1/\varepsilon} +\diff{\varepsilon} +\lesssim +\delta_n \sqrt{\log 1/\delta_n} +\lesssim +\Du +\delta_n \sqrt{\log 1/\delta_n}, +\end{align*} +% +where the last inequality is because +$Z_n^{L\prime} \equiv 0$ whenever $\Du = 0$. +There is a modification of $Z_n^{L\prime}$ +with continuous trajectories +by Kolmogorov's continuity criterion +\citep[Theorem~2.9]{legall2016brownian}. +Note that $L_n'$ is $\bA_n'$-measurable +and so by Lemma~\ref{lem:kernel_app_kmt_corollary} +we can assume that $Z_n^{L\prime}$ +depends only on $\bA_n'$ and some +random noise which is independent of +$(\bA_n', \bV_n')$. 
+Finally, in order to have +$\bA_n', \bV_n', L_n'$, and $Z_n^{L\prime}$ +all defined on the same probability space, +we note that $\bA_n$ and $\bV_n$ are random vectors +while $L_n'$ and $Z_n^{L\prime}$ +are stochastic processes +with continuous sample paths +indexed on +the compact interval $\cW$. +Hence the Vorob'ev--Berkes--Philipp theorem +(Lemma~\ref{lem:kernel_app_vbp}) +allows us to ``glue'' them together +in the desired way +on another new probability space, giving +$\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$, +retaining the single prime notation for clarity. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_strong_approx_Ln}] +See Lemma~\ref{lem:kernel_app_strong_approx_Ln} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}] + +We apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditional on +$\bA_n$. While this lemma is not in its current form +stated for conditional distributions, +the Yurinskii coupling on which it depends can be readily extended +by following the proof of \citet[Lemma~38]{belloni2019conditional}, +using a conditional version of Strassen's theorem +\cite[Theorem~B.2]{chen2020jackknife}. +Care must similarly be taken in embedding the conditionally Gaussian vectors +into a conditionally Gaussian process, using the +Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). + +By the mutual independence of $A_i$ and $V_{i j}$, +we have that the observations +$W_{i j}$ are independent +(but not necessarily identically distributed) +conditionally on $\bA_n$. +Note that +$\sup_{s,w \in \cW} |k_h(s,w)| \lesssim M_n = h^{-1}$ +and +$\E[k_h(W_{i j},w)^2 \mid \bA_n] \lesssim \sigma_n^2 = h^{-1}$. +The following uniform Lipschitz condition holds +with $l_{n,\infty} = C_\rL h^{-2}$, +by the Lipschitz property of the kernels: +% +\begin{align*} +\sup_{s,w,w' \in \cW} +\left| +\frac{k_h(s, w) - k_h(s, w')} +{w-w'} +\right| +\leq +l_{n,\infty}. +\end{align*} +% +Also, the following $L^2$ Lipschitz condition holds +uniformly with $l_{n,2} = 2 C_\rL \sqrt{C_\rd} h^{-3/2}$: +% +\begin{align*} +&\E\big[ +\big| +k_h(W_{i j}, w) - k_h(W_{i j}, w') +\big|^2 +\mid \bA_n +\big]^{1/2} \\ +&\quad\leq +\frac{C_\rL}{h^2} +|w-w'| +\left( +\int_{([w \pm h] \cup [w' \pm h]) \cap \cW} +f_{W \mid AA}(s \mid \bA_n) +\diff{s} +\right)^{1/2} \\ +&\quad\leq +\frac{C_\rL}{h^2} +|w-w'| +\sqrt{4h C_\rd} +\leq +l_{n,2} +|w-w'|. 
+\end{align*} +% +So we apply +Lemma~\ref{lem:kernel_app_yurinskii_corollary} +conditionally on $\bA_n$ +to the $\frac{1}{2}n(n-1)$ observations, +noting that +% +\begin{align*} +\sqrt{n^2h} E_n(w) += +\sqrt{\frac{2 n h}{n-1}} +\sqrt{\frac{2}{n(n-1)}} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^{n} +\Big( +k_h(W_{i j},w) +- \E[k_h(W_{i j},w) \mid A_i, A_j] +\Big), +\end{align*} +% +to deduce that for $t_n > 0$ there exist +(an enlarged probability space) +conditionally mean-zero +and conditionally Gaussian processes +$\tilde Z_n^{E\prime}(w)$ +with the same conditional covariance structure as +$\sqrt{n^2 h} E_n(w)$ and +satisfying +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\big| +\sqrt{n^2h} E_n(w) - \tilde Z_n^{E\prime}(w) +\big| +> t_n +\Bigm\vert \bA_n' +\right) \\ +&\quad= +\P\left( +\sup_{w \in \cW} +\left| +\sqrt{\frac{n(n-1)}{2}} E_n(w) +- \sqrt{\frac{n-1}{2 n h}} \tilde Z_n^{E\prime}(w) +\right| +> \sqrt{\frac{n-1}{2 n h}} +t_n +\Bigm\vert \bA_n' +\right) \\ +&\quad\lesssim +\frac{ +\sigma_n +\sqrt{\Leb(\cW)} +\sqrt{\log n} +\sqrt{M_n + \sigma_n\sqrt{\log n}} +}{n^{1/2} t_n^2 / h} +\sqrt{ +l_{n,2} +\sqrt{\log n} ++ \frac{l_{n,\infty}}{n} +\log n} \\ +&\quad\lesssim +\frac{ +h^{-1/2} +\sqrt{\log n} +\sqrt{h^{-1} + h^{-1/2} \sqrt{\log n}} +}{n^{1/2} t_n^2 / h} +\sqrt{ +h^{-3/2} +\sqrt{\log n} ++ \frac{h^{-2}}{n} +\log n} \\ +&\quad\lesssim +\sqrt{\frac{\log n}{n}} +\frac{ +\sqrt{1 + \sqrt{h \log n}} +}{t_n^2} +\sqrt{ +\sqrt{\frac{\log n}{h^3}} +\left( 1 + \sqrt{\frac{\log n}{n^2 h}} \right) +} \\ +&\quad\lesssim +\sqrt{\frac{\log n}{n}} +\frac{ 1 }{t_n^2} +\left( +\frac{\log n}{h^3} +\right)^{1/4} +\lesssim +t_n^{-2} +n^{-1/2} +h^{-3/4} +(\log n)^{3/4}, +\end{align*} +% +where we used +$h \lesssim 1 / \log n$ +and $\frac{\log n}{n^2 h} \lesssim 1$. +To obtain the trajectory regularity property of +$\tilde Z_n^{E\prime}$, +note that +for $w, w' \in \cW$, +by conditional independence, +% +\begin{align*} +&\E\left[ +\big| +\tilde Z_n^{E\prime}(w) +- \tilde Z_n^{E\prime}(w') +\big|^2 +\mid \bA_n' +\right]^{1/2} += +\sqrt{n^2h} \, +\E\left[ +\big| +E_n(w) +- E_n(w') +\big|^2 +\mid \bA_n +\right]^{1/2} \\ +&\quad\lesssim +\sqrt{n^2h} \, +\E\left[ +\left| +\frac{2}{n(n-1)} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^{n} +\Big( +k_h(W_{i j},w) +- k_h(W_{i j},w') +\Big) +\right|^2 +\Bigm\vert \bA_n +\right]^{1/2} \\ +&\quad\lesssim +\sqrt{h} \, +\E\left[ +\big| +k_h(W_{i j},w) +- k_h(W_{i j},w') +\big|^2 +\bigm\vert \bA_n +\right]^{1/2} +\lesssim +h^{-1} |w-w'|. +\end{align*} +% +So by the regularity result for Gaussian processes in +Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +with $\delta_n \in (0, 1/(2h)]$: +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| +\tilde Z_n^{E\prime}(w) +- \tilde Z_n^{E\prime}(w') +\big| +\mid \bA_n' +\right] +&\lesssim +\int_0^{\delta_n/h} +\sqrt{\log (\varepsilon^{-1} h^{-1})} +\diff{\varepsilon} +\lesssim +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h\delta_n}}, +\end{align*} +% +and there exists a modification with continuous trajectories. +Finally, in order to have $\bA_n', \bV_n', E_n'$, and $\tilde Z_n^{E\prime}$ +all defined on the same probability space, +we note that $\bA_n$ and $\bV_n$ are random vectors +while $E_n'$ and $\tilde Z_n^{E\prime}$ are stochastic processes +with continuous sample paths indexed on the compact interval $\cW$. 
+Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) +allows us to ``glue together'' $\big(\bA_n, \bV_n, E_n\big)$ +and $\big(E_n', \tilde Z_n^{E\prime}\big)$ +in the desired way on another new probability space, giving +$\big(\bA_n', \bV_n', E_n', \tilde Z_n^{E\prime}\big)$, +retaining the single prime notation for clarity. + +The trajectories of the conditionally Gaussian processes +$\tilde Z_n^{E\prime}$ depend on the choice of $t_n$, +necessitating the use of a divergent sequence $R_n$ to establish +bounds in probability. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_conditional_strong_approx_En}] +See Lemma~\ref{lem:kernel_app_conditional_strong_approx_En} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}] + +\proofparagraph{defining $Z_n^{E\dprime}$} + +Pick $\delta_n \to 0$ +with $\log 1/\delta_n \lesssim \log n$. +Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ +with cardinality $\Leb(\cW)/\delta_n$ +which is also a $\delta_n$-packing. +Let $\tilde Z_{n,\delta}^{E\prime}$ +be the restriction of $\tilde Z_n^{E\prime}$ +to $\cW_\delta$. +Let +$\tilde \Sigma_n^E(w, w') = +\E\big[\tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') +\mid \bA_n' \big]$ +be the conditional covariance function of $\tilde Z_n^{E\prime}$, +and define +$\Sigma_n^E(w,w') = \E\big[\tilde \Sigma_n^E(w,w')\big]$. +Let $\tilde \Sigma^E_{n,\delta}$ and $\Sigma^E_{n,\delta}$ +be the restriction matrices of +$\tilde \Sigma^E_n$ and $\Sigma^E_n$ +to $\cW_\delta \times \cW_\delta$, +noting that, as (conditional) covariance matrices, +these are +(almost surely) +positive semi-definite. + +Let $N \sim \cN(0, I_{|\cW_\delta|})$ +be independent of $\bA_n'$, +and define using the matrix square root +$\tilde Z_{n,\delta}^{E\dprime} += \big(\tilde \Sigma^E_{n,\delta})^{1/2} N$, +which has the same distribution as +$\tilde Z_{n,\delta}^{E\prime}$, +conditional on $\bA_n'$. +Extend it using +the Vorob'ev--Berkes--Philipp theorem +(Lemma~\ref{lem:kernel_app_vbp}) +to the compact interval $\cW$, +giving a conditionally Gaussian process +$\tilde Z_n^{E\dprime}$ +which has the same distribution as +$\tilde Z_{n}^{E\prime}$, +conditional on $\bA_n'$. +Define +$Z_{n,\delta}^{E\dprime} = \big(\Sigma^E_{n,\delta})^{1/2} N$, +noting that this is independent of $\bA_n'$, +and extend it using +the Vorob'ev--Berkes--Philipp theorem +(Lemma~\ref{lem:kernel_app_vbp}) +to a Gaussian process +$Z_n^{E\dprime}$ on the compact interval $\cW$, +which is independent of $\bA_n'$ +and has covariance structure given by +$\Sigma_n^E$. + +\proofparagraph{closeness of $Z_n^{E\dprime}$ and +$\tilde Z_n^{E\dprime}$ on the mesh} + +Note that conditionally on $\bA_n'$, +$\tilde Z_{n,\delta}^{E\dprime} - Z_{n,\delta}^{E\dprime}$ +is a length-$|\cW_\delta|$ +Gaussian random vector with covariance matrix +$\big( +\big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} +- \big(\Sigma^E_{n,\delta}\big)^{1/2} +\big)^2$. +So by the Gaussian maximal inequality in +Lemma~\ref{lem:kernel_app_gaussian_vector_maximal} +applied conditionally on $\bA_n'$, +% +\begin{align*} +\E\left[ +\max_{w \in \cW_\delta} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\Bigm| \bA_n' +\right] +&\lesssim +\sqrt{\log n} +\left\| +\tilde\Sigma^E_{n,\delta} +- \Sigma^E_{n,\delta} +\right\|_2^{1/2}, +\end{align*} +% +since $\log |\cW_\delta| \lesssim \log n$. 
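+Here the mesh cardinality enters only through its logarithm:
+since $|\cW_\delta| \leq \Leb(\cW)/\delta_n$
+and $\delta_n$ was chosen with $\log 1/\delta_n \lesssim \log n$,
+%
+\begin{align*}
+\log |\cW_\delta|
+\leq
+\log \Leb(\cW)
++ \log \frac{1}{\delta_n}
+\lesssim
+\log n.
+\end{align*}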
+Next, we apply some U-statistic theory to +$\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta}$, +with the aim of applying the +matrix concentration result +for second-order U-statistics +presented in Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}. +Firstly, we note that +since +the conditional covariance structures of +$\tilde Z_n^{E\prime}$ and $\sqrt{n^2h} E_n$ +are equal in distribution, +we have, +writing $E_n(\cW_\delta)$ +for the vector $\big(E_n(w) : w \in \cW_\delta\big)$ +and similarly for $k_h(W_{i j}, \cW_\delta)$, +% +\begin{align*} +\tilde\Sigma^E_{n,\delta} +&= +n^2h \E[E_n(\cW_\delta) E_n(\cW_\delta)^\T \mid \bA_n] \\ +&= +n^2h +\frac{4}{n^2(n-1)^2} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^{n} +\E\left[ +\Big( +k_h(W_{i j}, \cW_\delta) +- \E\left[ +k_h(W_{i j}, \cW_\delta) +\mid \bA_n +\right] +\Big) +\right. \\ +&\qquad\left. +\times\Big( +k_h(W_{i j}, \cW_\delta) +- \E\left[ +k_h(W_{i j}, \cW_\delta) +\mid \bA_n +\right] +\Big)^\T +\bigm\vert \bA_n +\right] \\ +&= +\frac{4h}{(n-1)^2} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^{n} +u(A_i, A_j), +\end{align*} +% +where we +define the random +$|\cW_\delta| \times |\cW_\delta|$ +matrices +% +\begin{align*} +u(A_i, A_j) +&= +\E\!\left[ +k_h(W_{i j}, \cW_\delta) +k_h(W_{i j}, \cW_\delta)^\T +\mid \bA_n +\right] +- +\E\!\left[ +k_h(W_{i j}, \cW_\delta) +\mid \bA_n +\right] +\E\!\left[ +k_h(W_{i j}, \cW_\delta) +\mid \bA_n +\right]^\T. +\end{align*} +% +Let $u(A_i) = \E[u(A_i, A_j) \mid A_i]$ and +$u = \E[u(A_i, A_j)]$. +The decomposition +$\tilde \Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} += \tilde L +\tilde Q$ +holds by Lemma~\ref{lem:kernel_app_general_hoeffding}, where +% +\begin{align*} +\tilde L +&= +\frac{4h}{n-1} +\sum_{i=1}^n +\big( +u(A_i) - u +\big), +&\tilde Q +&= +\frac{4h}{(n-1)^2} +\sum_{i=1}^{n-1} +\sum_{j=i+1}^{n} +\big( +u(A_i, A_j) - u(A_i) - u(A_j) + u +\big). +\end{align*} +% +Next, we seek an almost sure upper bound on +$\|u(A_i, A_j)\|_2$. +Since this is a symmetric matrix, +we have by H{\"o}lder's inequality +% +\begin{align*} +\|u(A_i, A_j)\|_2 +&\leq +\|u(A_i, A_j)\|_1^{1/2} +\|u(A_i, A_j)\|_\infty^{1/2} += +\max_{1 \leq k \leq |\cW_\delta|} +\sum_{l=1}^{|\cW_\delta|} +|u(A_i, A_j)_{kl}|. +\end{align*} +% +The terms on the right hand side can be bounded as follows, +writing $w, w'$ for the $k$th and $l$th +points in $\cW_\delta$ respectively: +% +\begin{align*} +|u(A_i, A_j)_{kl}| +&= +\big| +\E\left[ +k_h(W_{i j}, w) +k_h(W_{i j}, w') +\mid \bA_n +\right] +- +\E\left[ +k_h(W_{i j}, w) +\mid \bA_n +\right] +\E\left[ +k_h(W_{i j}, w') +\mid \bA_n +\right] +\big| \\ +&\lesssim +\E\left[ +| +k_h(W_{i j}, w) +k_h(W_{i j}, w') +| +\mid \bA_n +\right] ++ +\E\left[ +| +k_h(W_{i j}, w) +| +\mid \bA_n +\right] +\E\left[ +| +k_h(W_{i j}, w') +| +\mid \bA_n +\right] \\ +&\lesssim +h^{-1} +\I\big\{ |w-w'| \leq 2h \big\} ++ 1 +\lesssim +h^{-1} +\I\big\{ |k-l| \leq 2h/\delta_n \big\} ++ 1, +\end{align*} +% +where we used that +$|w-w'| \geq |k-l| \delta_n$ +because $\cW_\delta$ +is a $\delta_n$-packing. +Hence +% +\begin{align*} +\|u(A_i, A_j)\|_2 +&\leq +\max_{1 \leq k \leq |\cW_\delta|} +\sum_{l=1}^{|\cW_\delta|} +|u(A_i, A_j)_{kl}| +\lesssim +\max_{1 \leq k \leq |\cW_\delta|} +\sum_{l=1}^{|\cW_\delta|} +\Big( +h^{-1} +\I\big\{ |k-l| \leq 2h/\delta_n \big\} ++ 1 +\Big) \\ +&\lesssim +1/\delta_n ++ 1/h ++ |\cW_\delta| +\lesssim +1/\delta_n ++ 1/h. +\end{align*} +% +Clearly, the same bound holds for +$\|u(A_i)\|_2$ and $\|u\|_2$, by Jensen's inequality. 
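+For completeness, the Jensen step invoked here is the conditional
+Jensen inequality applied to the convex spectral norm:
+%
+\begin{align*}
+\|u(A_i)\|_2
+=
+\big\| \E[u(A_i, A_j) \mid A_i] \big\|_2
+\leq
+\E\big[ \|u(A_i, A_j)\|_2 \mid A_i \big]
+\lesssim
+\frac{1}{\delta_n}
++ \frac{1}{h},
+\end{align*}
+%
+and the same argument applies to
+$\|u\|_2 = \|\E[u(A_i, A_j)]\|_2$.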
+Therefore, applying the matrix Bernstein inequality +(Lemma~\ref{lem:kernel_app_matrix_bernstein}) +to the zero-mean matrix $\tilde L$ gives +% +\begin{align*} +\E\left[ +\left\| +\tilde L +\right\|_2 +\right] +&\lesssim +\frac{h}{n} +\left(\frac{1}{\delta_n} + \frac{1}{h} \right) +\left( +\log |\cW_\delta| + \sqrt{n \log |\cW_\delta|} +\right) +\lesssim +\left(\frac{h}{\delta_n} + 1 \right) +\sqrt{\frac{\log n}{n}}. +\end{align*} +% +The matrix U-statistic concentration inequality +(Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}) +with $\tilde Q$ gives +% +\begin{align*} +\E\left[ +\big\| +\tilde Q +\big\|_2 +\right] +&\lesssim +\frac{h}{n^2} +n +\left(\frac{1}{\delta_n} + \frac{1}{h} \right) +\left( +\log |\cW_\delta| +\right)^{3/2} +\lesssim +\left(\frac{h}{\delta_n} + 1 \right) +\frac{(\log n)^{3/2}}{n}. +\end{align*} +% +Hence taking a marginal expectation +and applying Jensen's inequality, +% +\begin{align*} +&\E\left[ +\max_{w \in \cW_\delta} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\right] \\ +&\quad\lesssim +\sqrt{\log n} \ +\E\left[ +\left\| +\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} +\right\|_2^{1/2} +\right] +\lesssim +\sqrt{\log n} \ +\E\left[ +\left\| +\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} +\right\|_2 +\right]^{1/2} \\ +&\quad\lesssim +\sqrt{\log n} \ +\E\left[ +\left\| +\tilde L ++ \tilde Q +\right\|_2 +\right]^{1/2} +\lesssim +\sqrt{\log n} \ +\E\left[ +\left\| +\tilde L +\right\|_2 ++ \left\| +\tilde Q +\right\|_2 +\right]^{1/2} \\ +&\quad\lesssim +\sqrt{\log n} +\left( +\left(\frac{h}{\delta_n} + 1 \right) +\sqrt{\frac{\log n}{n}} ++ \left(\frac{h}{\delta_n} + 1 \right) +\frac{(\log n)^{3/2}}{n} +\right)^{1/2} \\ +&\quad\lesssim +\sqrt{\frac{h}{\delta_n} + 1} +\frac{(\log n)^{3/4}}{n^{1/4}}. +\end{align*} + +\proofparagraph{regularity of $Z_n^E$ and $\tilde Z_n^{E\prime}$} + +Define the semimetrics +% +\begin{align*} +\rho(w, w')^2 +&= +\E\left[ +\big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big|^2 +\right], +&\tilde\rho(w, w')^2 +&= +\E\left[ +\big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 +\mid \bA_n +\right]. +\end{align*} +% +We bound $\tilde \rho$ as follows, +since $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ +have the same conditional covariance structure: +% +\begin{align*} +\tilde\rho(w, w') +&= +\E\left[ +\big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 +\mid \bA_n' +\right]^{1/2} \\ +&= +\sqrt{n^2 h} \, +\E\left[ +\big|E_n(w) - E_n(w')\big|^2 +\mid \bA_n' +\right]^{1/2} +\lesssim +h^{-1} +|w-w'|, +\end{align*} +% +uniformly in $\bA_n'$, +where the last line was shown in +the proof of Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. +Note that also +% +\begin{align*} +\rho(w, w') +&= +\sqrt{\E[\tilde \rho(w,w')^2]} +\lesssim +h^{-1} +|w-w'|. +\end{align*} +% +Thus Lemma~\ref{lem:kernel_app_gaussian_process_maximal} +applies directly to $Z_n^E$ +and conditionally to $\tilde Z_n^{E\prime}$, +with $\delta_n \in (0, 1/(2h)]$, +demonstrating that +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| +\bigm\vert \bA_n' +\right] +&\lesssim +\int_0^{\delta_n / h} +\sqrt{\log (1 / (\varepsilon h))} +\diff{\varepsilon} +\lesssim +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h \delta_n}}, \\ +\E\left[ +\sup_{|w-w'| \leq \delta_n} +|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')| +\right] +&\lesssim +\int_0^{\delta_n / h} +\sqrt{\log (1 / (\varepsilon h))} +\diff{\varepsilon} +\lesssim +\frac{\delta_n}{h} +\sqrt{\log \frac{1}{h \delta_n}}. 
+\end{align*} +% +Continuity of trajectories follows from this. + +\proofparagraph{conclusion} + +We use the previous parts to deduce that +% +\begin{align*} +&\E\left[ +\sup_{w \in \cW} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\right] \\ +&\quad\lesssim +\E\left[ +\max_{w \in \cW_\delta} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\right] \\ +&\qquad+ +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\left\{ +\big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| ++ \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big| +\right\} +\right] \\ +&\quad\lesssim +\sqrt{\frac{h}{\delta_n} + 1} +\frac{(\log n)^{3/4}}{n^{1/4}} ++ \frac{\delta_n \sqrt{\log n}}{h}. +\end{align*} +% +Setting +$\delta_n = h \left( \frac{\log n}{n} \right)^{1/6}$ +gives +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| +\right] +&\lesssim +n^{-1/6} (\log n)^{2/3}. +\end{align*} +% +Independence of $Z_n^{E\dprime}$ and $\bA_n''$ +follows by applying the +Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}), +conditionally on $\bA_n'$, to the variables +$\big(\bA_n', \tilde Z_n^{E\prime}\big)$ and +$\big(\tilde Z_n^{E\dprime}, Z_n^{E\dprime}\big)$. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_unconditional_strong_approx_En}] +See Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En} +\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_app_strong_approx_fW}] + +We add together the strong approximations +for the $L_n$ and $E_n$ terms, +and then add an independent Gaussian process +to account for the variance of $Q_n$. + +\proofparagraph{gluing together the strong approximations} + +Let $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$ +be the strong approximation for $L_n$ +derived in Lemma~\ref{lem:kernel_app_strong_approx_Ln}. +Let $\big(\bA_n'', \bV_n'', E_n'', \tilde Z_n^{E\dprime}\big)$ +and +$\big(\bA_n''', \bV_n''', \tilde Z_n^{E\tprime}, Z_n^{E\tprime}\big)$ +be the conditional and unconditional strong approximations for $E_n$ +given in Lemmas~\ref{lem:kernel_app_conditional_strong_approx_En} +and \ref{lem:kernel_app_unconditional_strong_approx_En} +respectively. +The first step is to define copies of these variables +and processes on the same probability space. +This is achieved by applying the +Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). +Dropping the prime notation for clarity, we construct +$\big(\bA_n, \bV_n, L_n, Z_n^L, E_n, \tilde Z_n^E, Z_n^E\big)$ +with the following properties: +% +\begin{enumerate}[label=(\roman*)] + +\item +$\sup_{w \in \cW} +\big| \sqrt{n} L_n(w) - Z_n^L(w)\big| +\lesssim_\P n^{-1/2} \log n$, + +\item +$\sup_{w \in \cW} +\big|\sqrt{n^2h} E_n(w) - \tilde Z^E_n(w) \big| +\lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$, + +\item +$\sup_{w \in \cW} +\big| \tilde Z^E_n(w) - Z^E_n(w) \big| +\lesssim_\P n^{-1/6} (\log n)^{2/3}$, + +\item +$Z_n^L$ is independent of $Z_n^E$. + +\end{enumerate} +% +Note that the independence of +$Z_n^L$ and $Z_n^E$ +follows since $Z_n^L$ +depends only on $\bA_n$ and some independent random noise, +while $Z_n^E$ is independent of $\bA_n$. +Therefore $(Z_n^L, Z_n^E)$ are jointly Gaussian. 
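+One way to see this joint Gaussianity,
+under the independence just noted,
+is through finite-dimensional linear combinations:
+for any $m \geq 1$, points $w_1, \ldots, w_m \in \cW$,
+and coefficients $a, b \in \R^m$
+(notation introduced only for this remark),
+%
+\begin{align*}
+\sum_{j=1}^{m} a_j Z_n^L(w_j)
++ \sum_{j=1}^{m} b_j Z_n^E(w_j)
+\end{align*}
+%
+is a sum of two independent Gaussian random variables
+and is therefore Gaussian.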
+To get the strong approximation result +for $\hat f_W$, +define the Gaussian process +% +\begin{align*} +Z_n^f(w) +&= +\frac{1}{\sqrt n} Z_n^L(w) ++ \frac{1}{n} Z_n^Q(w) ++ \frac{1}{\sqrt{n^2h}} Z_n^E(w), +\end{align*} +% +where $Z_n^Q(w)$ is a mean-zero Gaussian process +independent of everything else +with covariance +% +\begin{align*} +\E\big[ +Z_n^Q(w) +Z_n^Q(w') +\big] +&= +n^2 \E\big[ +Q_n(w) +Q_n(w') +\big]. +\end{align*} +% +As shown in the proof of +Lemma~\ref{lem:kernel_uniform_concentration}, +the process +$Q_n(w)$ is uniformly Lipschitz +and uniformly bounded in $w$. +Thus by Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +we have +$\E\big[\sup_{w \in \cW} +|Z_n^Q(w)|\big] +\lesssim 1$. +Therefore the uniform approximation error is given by +% +\begin{align*} +& +\sup_{w \in \cW} +\big| +\hat f_W(w) - \E[\hat f_W(w)] +- Z_n^f(w) +\big| +\\ +&\quad= +\sup_{w \in \cW} +\left| +\frac{1}{\sqrt n} Z_n^L(w) ++ \frac{1}{n} Z_n^Q(w) ++ \frac{1}{\sqrt{n^2h}} Z_n^E(w) +- \Big( +L_n(w) + Q_n(w) + E_n(w) +\Big) +\right| \\ +&\quad\leq +\sup_{w \in \cW} +\bigg( +\frac{1}{\sqrt n} +\left| +Z_n^L(w) - \sqrt{n} L_n(w) +\right| ++ \frac{1}{\sqrt{n^2h}} +\left| +\tilde Z_n^E(w) - \sqrt{n^2h} E_n(w) +\right| \\ +&\qquad+ +\frac{1}{\sqrt{n^2h}} +\left| +Z_n^E(w) - \tilde Z_n^E(w) +\right| +\big| Q_n(w) \big| ++ \frac{1}{n} +\big| Z_n^Q(w) \big| +\bigg) \\ +&\quad\lesssim_\P +n^{-1} \log n ++ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-7/6} h^{-1/2} (\log n)^{2/3}. +\end{align*} + +\proofparagraph{covariance structure} + +Since $L_n$, $Q_n$, and $E_n$ +are mutually orthogonal in $L^2$ +(as shown in Lemma~\ref{lem:kernel_hoeffding}), +we have the following covariance +structure: +% +\begin{align*} +\E\big[Z_n^f(w) Z_n^f(w')\big] +&= +\frac{1}{n} \E\big[ Z_n^L(w) Z_n^L(w') \big] ++ \frac{1}{n^2} \E\big[ Z_n^Q(w) Z_n^Q(w') \big] ++ \frac{1}{n^2h} \E\big[ Z_n^E(w) Z_n^E(w') \big] \\ +&= +\E\big[ L_n(w) L_n(w') \big] ++ \E\big[ Q_n(w) Q_n(w') \big] ++ \E\big[ E_n(w) E_n(w') \big] \\ +&= +\E\big[ +\big(\hat f_W(w) - \E[\hat f_W(w)]\big) +\big(\hat f_W(w') - \E[\hat f_W(w')]\big) +\big]. +\end{align*} + +\proofparagraph{trajectory regularity} + +The trajectory regularity of the process +$Z_n^f$ follows directly by adding the regularities +of the processes $\frac{1}{\sqrt n} Z_n^L$, +$\frac{1}{n} Z_n^Q$, and $\frac{1}{\sqrt{n^2h}} Z_n^E$. +Similarly, $Z_n^f$ has continuous trajectories. +\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_strong_approx_Tn}] + +Define $Z_n^T(w) = \frac{Z_n^f(w)}{\sqrt{\Sigma_n(w,w)}}$ so that +% +\begin{align*} +\left| T_n(w) - Z_n^T(w) \right| +&= \frac{\big| \hat f_W(w) - f_W(w) - Z_n^f(w) \big|} +{\sqrt{\Sigma_n(w,w)}}. +\end{align*} +% +By Theorems~\ref{thm:kernel_app_strong_approx_fW} and \ref{thm:kernel_bias}, +the numerator can be bounded above by +% +\begin{align*} +&\sup_{w \in \cW} +\left| +\hat f_W(w) - f_W(w) +- +Z_n^f(w) +\right| \\ +&\quad\leq +\sup_{w \in \cW} +\left| +\hat f_W(w) +- \E\big[\hat f_W(w)\big] +- +Z_n^f(w) +\right| ++ \sup_{w \in \cW} +\left| +\E\big[\hat f_W(w)\big] +- f_W(w) +\right| \\ +&\quad\lesssim_\P +n^{-1} \log n ++ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-7/6} h^{-1/2} (\log n)^{2/3} ++ h^{p \wedge \beta}. +\end{align*} +% +By Lemma~\ref{lem:kernel_variance_bounds} +with $\inf_\cW f_W(w) > 0$, +the denominator is bounded below by +% +\begin{align*} +\inf_{w \in \cW} +\sqrt{\Sigma_n(w,w)} +&\gtrsim +\frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}}, +\end{align*} +% +and the result follows. 
+\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_infeasible_ucb}] + +Note that the covariance structure of $Z_n^T$ is given by +% +\begin{align*} +\Cov\big[ +Z_n^T(w), +Z_n^T(w') +\big] +&= +\frac{\Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. +\end{align*} +% +We apply an anti-concentration result +to establish that all quantiles of +$\sup_{w \in \cW} \big|Z_n^T(w)\big|$ exist. +To do this, we must first establish regularity +properties of $Z_n^T$. + +\proofparagraph{$L^2$ regularity of $Z_n^T$} + +Writing $k_{i j}'$ for $k_h(W_{i j},w')$ etc., +note that by Lemma~\ref{lem:kernel_app_covariance_structure}, +% +\begin{align*} +&\big| +\Sigma_n(w,w') +- +\Sigma_n(w, w'') +\big| \\ +&\quad= +\left| +\frac{2}{n(n-1)} +\Cov\big[ +k_{i j}, +k_{i j}' +\big] ++ +\frac{4(n-2)}{n(n-1)} +\Cov\big[ +k_{i j}, +k_{i r}' +\big] +\right. \\ +&\left. +\quad\qquad- +\frac{2}{n(n-1)} +\Cov\big[ +k_{i j}, +k_{i j}'' +\big] +- +\frac{4(n-2)}{n(n-1)} +\Cov\big[ +k_{i j}, +k_{i r}'' +\big] +\right| \\ +&\quad\leq +\frac{2}{n(n-1)} +\Big| +\Cov\big[ +k_{i j}, +k_{i j}' - k_{i j}'' +\big] +\Big| ++ +\frac{4(n-2)}{n(n-1)} +\Big| +\Cov\big[ +k_{i j}, +k_{i r}' - k_{i r}'' +\big] +\Big| \\ +&\quad\leq +\frac{2}{n(n-1)} +\|k_{i j}\|_\infty +\|k_{i j}' - k_{i j}''\|_\infty ++ +\frac{4(n-2)}{n(n-1)} +\|k_{i j}\|_\infty +\|k_{i r}' - k_{i r}''\|_\infty \\ +&\quad\leq +\frac{4}{n h^3} +C_\rk C_\rL +|w'-w''| +\lesssim +n^{-1}h^{-3} |w'-w''| +\end{align*} +% +uniformly in $w, w', w'' \in \cW$. +Therefore, by Lemma~\ref{lem:kernel_variance_bounds}, +with $\delta_n \leq n^{-2} h^2$, +we have +% +\begin{align*} +\inf_{|w-w'| \leq \delta_n} +\Sigma_n(w,w') +&\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h} +- n^{-1} h^{-3} \delta_n +\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h} +- \frac{1}{n^3h} +\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h}, \\ +\sup_{|w-w'| \leq \delta_n} +\Sigma_n(w,w') +&\lesssim +\frac{\Du^2}{n} ++ \frac{1}{n^2h} ++ n^{-1} h^{-3} \delta_n +\lesssim +\frac{\Du^2}{n} ++ \frac{1}{n^2h} ++ \frac{1}{n^3h} +\lesssim +\frac{\Du^2}{n} ++ \frac{1}{n^2h}. +\end{align*} +% +The $L^2$ +regularity of $Z_n^T$ +is +% +\begin{align*} +\E\left[ +\big( +Z_n^T(w) - Z_n^T(w') +\big)^2 +\right] +&= +2 - 2 +\frac{\Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. +\end{align*} +% +Applying the elementary result +that for $a,b,c > 0$, +% +\begin{align*} +1 - \frac{a}{\sqrt{b c}} +&= +\frac{b(c-a) + a(b-a)} +{\sqrt{b c}\big(\sqrt{b c} + a\big)}, +\end{align*} +% +with $a = \Sigma_n(w,w')$, +$b = \Sigma_n(w,w)$, +and $c = \Sigma_n(w',w')$, +and noting $|c-a| \lesssim n^{-1} h^{-3} |w-w'|$ +and $|b-a| \lesssim n^{-1} h^{-3} |w-w'|$ and +$\frac{\Dl^2}{n} + \frac{1}{n^2h} +\lesssim a,b,c \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}$, +yields +% +\begin{align*} +\E\left[ +\big( +Z_n^T(w) - Z_n^T(w') +\big)^2 +\right] +&\lesssim +\frac{(\Du^2/n + 1/(n^2h))n^{-1}h^{-3}|w-w'|} +{(\Dl^2/n + 1/(n^2h))^2} \\ +&\lesssim +\frac{n^{2} h^{-4}|w-w'|} +{n^{-4}h^{-2}} +\lesssim +n^2 h^{-2} |w-w'|. +\end{align*} +% +Thus the semimetric +induced by $Z_n^T$ on $\cW$ is +% +\begin{align*} +\rho(w,w') +&\vcentcolon= +\E\left[ +\big( +Z_n^T(w) - Z_n^T(w') +\big)^2 +\right]^{1/2} +\lesssim +n h^{-1} \sqrt{|w-w'|}. 
+\end{align*} + +\proofparagraph{trajectory regularity of $Z_n^T$} + +By the bound on $\rho$ from the previous part, +we deduce the covering number bound +% +\begin{align*} +N(\varepsilon, \cW, \rho) +&\lesssim +N\big( +\varepsilon, +\cW, +n h^{-1} \sqrt{|\cdot|} +\big) +\lesssim +N\big( +n^{-1} h \varepsilon, +\cW, +\sqrt{|\cdot|} +\big) \\ +&\lesssim +N\big( +n^{-2} h^2 \varepsilon^2, +\cW, +|\cdot| +\big) +\lesssim +n^2 h^{-2} \varepsilon^{-2}. +\end{align*} +% +Now apply the Gaussian process regularity result from +Lemma~\ref{lem:kernel_app_gaussian_process_maximal}. +% +\begin{align*} +\E\left[ +\sup_{\rho(w,w') \leq \delta} +\big| Z_n^T(w) - Z_n^T(w') \big| +\right] +&\lesssim +\int_0^{\delta} +\sqrt{\log N(\varepsilon, \cW, \rho)} +\diff{\varepsilon} +\lesssim +\int_0^{\delta} +\sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} +\diff{\varepsilon} \\ +&\lesssim +\int_0^{\delta} +\left( +\sqrt{\log n} ++ \sqrt{\log 1/\varepsilon} +\right) +\diff{\varepsilon} +\lesssim +\delta +\left( +\sqrt{\log n} ++ \sqrt{\log 1/\delta} +\right), +\end{align*} +% +and so +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| Z_n^T(w) - Z_n^T(w') \big| +\right] +&\lesssim +\E\left[ +\sup_{\rho(w,w') \leq n h^{-1} \delta_n^{1/2}} +\big| Z_n^T(w) - Z_n^T(w') \big| +\right] +\lesssim +n h^{-1} +\sqrt{\delta_n \log n}, +\end{align*} +% +whenever $1/\delta_n$ +is at most polynomial in $n$. + +\proofparagraph{existence of the quantile} + +Apply the Gaussian anti-concentration +result from Lemma~\ref{lem:kernel_app_anticoncentration}, +noting that $Z_n^T$ is separable, +mean-zero, and has unit variance: +% +\begin{align*} +\sup_{t \in \R} +\P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq 2\varepsilon_n +\right) +&\leq +8 \varepsilon_n +\left( +1 + \E\left[ +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\right] +\right). +\end{align*} +% +To bound the supremum on the right hand side, +apply the Gaussian process maximal inequality from +Lemma~\ref{lem:kernel_app_gaussian_process_maximal} +with +$\sigma \leq 1$ and +$N(\varepsilon, \cW, \rho) \lesssim n^2 h^{-2} \varepsilon^{-2}$: +% +\begin{align*} +\E\left[ +\sup_{w \in \cW} +\big|Z_n^T(w)\big| +\right] +&\lesssim +1 ++ \int_0^{2} +\sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} +\diff{\varepsilon} +\lesssim +\sqrt{\log n}. +\end{align*} +% +Therefore +% +\begin{align*} +\sup_{t \in \R} +\P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq \varepsilon +\right) +&\lesssim +\varepsilon +\sqrt{\log n}. +\end{align*} +% +Letting $\varepsilon \to 0$ +shows that the distribution function of +$\sup_{w \in \cW} \big|Z_n^T(w)\big|$ +is continuous, +and therefore all of its quantiles exist. + +\proofparagraph{validity of the infeasible uniform confidence band} + +Under Assumption~\ref{ass:kernel_rates} and with a +sufficiently slowly diverging sequence $R_n$, +the strong approximation rate established in +Theorem~\ref{thm:kernel_strong_approx_Tn} is +% +\begin{align*} +&\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ +&\quad\lesssim_\P +\frac{ +n^{-1/2} \log n ++ n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-2/3} h^{-1/2} (\log n)^{2/3} ++ n^{1/2} h^{p \wedge \beta}} +{\Dl + 1/\sqrt{n h}} +\ll \frac{1}{\sqrt{\log n}}. +\end{align*} +% +So by Lemma~\ref{lem:kernel_app_slow_convergence}, take $\varepsilon_n$ such +that +% +\begin{align*} +\P \left( +\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| +> \varepsilon_n +\right) +&\leq +\varepsilon_n \sqrt{\log n} +\end{align*} +% +and $\varepsilon_n \sqrt{\log n} \to 0$. 
+So by the previously established anti-concentration result, +% +\begin{align*} +&\P\left( +\left| +\hat f_W(w) - f_W(w) +\right| +\leq +q_{1-\alpha} +\sqrt{\Sigma_n(w,w)} +\textup{ for all } +w \in \cW +\right) \\ +&\quad= +\P\left( +\sup_{w \in \cW} +\left| T_n(w) \right| +\leq +q_{1-\alpha} +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\left| Z_n^T(w) \right| +\leq +q_{1-\alpha} ++ \varepsilon_n +\right) ++ \P \left( +\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| +> \varepsilon_n +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq +q_{1-\alpha} +\right) ++ \P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- q_{1-\alpha} +\right| +\leq \varepsilon_n +\right) ++ \varepsilon_n \sqrt{\log n} \\ +&\quad\leq +1 - \alpha ++ 2 \varepsilon_n \sqrt{\log n}. +\end{align*} +% +The lower bound follows analogously: +% +\begin{align*} +&\P\left( +\left| +\hat f_W(w) - f_W(w) +\right| +\leq +q_{1-\alpha} +\sqrt{\Sigma_n(w,w)} +\textup{ for all } +w \in \cW +\right) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\left| Z_n^T(w) \right| +\leq +q_{1-\alpha} +- \varepsilon_n +\right) +- \varepsilon_n \sqrt{\log n} \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq +q_{1-\alpha} +\right) +- \P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- q_{1-\alpha} +\right| +\leq \varepsilon_n +\right) +- \varepsilon_n \sqrt{\log n} \\ +&\quad\leq +1 - \alpha +- 2 \varepsilon_n \sqrt{\log n}. +\end{align*} +% +Finally, we apply $\varepsilon_n \sqrt{\log n} \to 0$ +to see +% +\begin{align*} +\left| +\P\left( +\left| +\hat f_W(w) - f_W(w) +\right| +\leq +q_{1-\alpha} +\sqrt{\Sigma_n(w,w)} +\textup{ for all } +w \in \cW +\right) +- (1 - \alpha) +\right| +&\to 0. +\end{align*} +\end{proof} + +Before proving +Lemma~\ref{lem:kernel_app_covariance_estimation}, +we provide the following useful +concentration inequality. +This is essentially a corollary of the +U-statistic concentration inequality given in +Theorem~3.3 in \citet{gine2000exponential}. + +\begin{lemma}[A concentration inequality] +\label{lem:kernel_app_dyadic_concentration} + +Let $X_{i j}$ be mutually independent for $1 \leq i < j \leq n$ +taking values in a measurable space $\cX$. +Let $h_1$, $h_2$ be measurable functions from $\cX$ to $\R$ +satisfying the following for all $i$ and $j$. +% +\begin{align*} +\E\big[h_1(X_{i j})\big] +&= 0, +&\E\big[h_2(X_{i j})\big] +&=0, \\ +\E\big[h_1(X_{i j})^2\big] +&\leq \sigma^2, +&\E\big[h_2(X_{i j})^2\big] +&\leq \sigma^2, \\ +\big|h_1(X_{i j})\big| +&\leq M, +&\big|h_2(X_{i j})\big| +&\leq M. +\end{align*} +% +Consider the sum +% +\begin{align*} +S_n +&= +\sum_{1 \leq i < j < r \leq n} +h_1(X_{i j}) +h_2(X_{i r}). +\end{align*} +% +Then $S_n$ satisfies the concentration inequality +% +\begin{align*} +\P\big( +|S_n| \geq t +\big) +&\leq +C \exp\left( +-\frac{1}{C} +\min \left\{ +\frac{t^2}{n^3 \sigma^4}, +\frac{t}{\sqrt{n^3 \sigma^4}}, +\frac{t^{2/3}}{(n M \sigma)^{2/3}}, +\frac{t^{1/2}}{M} +\right\} +\right) +\end{align*} +% +for some universal constant +$C > 0$ +and for all $t>0$. + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:kernel_app_dyadic_concentration}] + +We proceed in three main steps. +Firstly, we write $S_n$ as a second-order U-statistic +where we use double indices instead of single indices. +Then we use a decoupling result to introduce extra independence. +Finally, a concentration result is applied +to the decoupled U-statistic. 
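+As a small concrete illustration of this reindexing,
+added here only for exposition,
+take $n = 3$:
+the only admissible triple is $(i, j, r) = (1, 2, 3)$, so
+%
+\begin{align*}
+S_3
+&=
+h_1(X_{12})
+h_2(X_{13}),
+\end{align*}
+%
+and in the double-index form introduced below
+this single term corresponds to the pair of index pairs
+$(i, j) = (1, 2)$ and $(q, r) = (1, 3)$.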
+ +\proofparagraph{writing $S_n$ as a second-order U-statistic} + +Note that we can write $S_n$ as +the second-order U-statistic +% +\begin{align*} +S_n +&= +\sum_{1 \leq i < j \leq n} +\sum_{1 \leq q < r \leq n} +h_{i j q r} +(X_{i j}, X_{qr}), +\end{align*} +% +where +% +\begin{align*} +h_{i j q r} +(a,b) +&= +h_1(a) h_2(b) \, +\I\{j 0$ satisfying +% +$\P\big( |S_n| \geq t \big) +\leq C_1 \P\big( C_1 |\tilde S_n| \geq t \big)$, +% +where +% +$\tilde S_n = \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} +h_{i j q r} (X_{i j}, X'_{qr})$, +% +with $(X'_{i j})$ +an independent copy of $(X_{i j})$. + +\proofparagraph{U-statistic concentration} + +The U-statistic kernel $h_{i j q r}(X_{i j}, X'_{qr})$ +is totally degenerate in that +% +$ \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X_{i j}] += \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X'_{qr}] = 0$. +% +Define and bound the following quantities: +% +\pagebreak +% +\begin{align*} +A +&= +\max_{i j q r} +\|h_{i j q r}(X_{i j}, X'_{qr})\|_\infty +\leq M^2, \\ +B +&= +\max +\left\{ +\left\| +\sum_{1 \leq i < j \leq n} +\E\Big[ +h_{i j q r}(X_{i j}, X'_{qr})^2 +\mid X_{i j} +\Big] +\right\|_\infty, +\left\| +\sum_{1 \leq q < r \leq n} +\E\Big[ +h_{i j q r}(X_{i j}, X'_{qr})^2 +\mid X'_{qr} +\Big] +\right\|_\infty +\right\}^{1/2} \\ +&= +\max +\left\{ +\left\| +\sum_{1 \leq i < j \leq n} +h_1(X_{i j})^2 +\E\big[ +h_2(X_{qr}')^2 +\big] +\I\{j 0$ and for all $t > 0$, +% +\begin{align*} +\P\left( +|\tilde S_n| \geq t +\right) +&\leq +C_2 \exp\left( +-\frac{1}{C_2} +\min \left\{ +\frac{t^2}{C^2}, +\frac{t}{D}, +\frac{t^{2/3}}{B^{2/3}}, +\frac{t^{1/2}}{A^{1/2}} +\right\} +\right) \\ +&\leq +C_2 \exp\left( +-\frac{1}{C_2} +\min \left\{ +\frac{t^2}{n^3 \sigma^4}, +\frac{t}{\sqrt{n^3 \sigma^4}}, +\frac{t^{2/3}}{(n M \sigma)^{2/3}}, +\frac{t^{1/2}}{M} +\right\} +\right). +\end{align*} + +\proofparagraph{Conclusion} + +By the previous parts +and absorbing constants into a new constant $C > 0$, +we therefore have +% +\begin{align*} +\P\left( +|S_n| \geq t +\right) +&\leq +C_1 \P\left( +C_1 |\tilde S_n| \geq t +\right) \\ +&\leq +C_1 C_2 \exp\left( +-\frac{1}{C_2} +\min \left\{ +\frac{t^2}{n^3 \sigma^4 C_1^2}, +\frac{t}{\sqrt{n^3 \sigma^4 C_1}}, +\frac{t^{2/3}}{(n M \sigma C_1)^{2/3}}, +\frac{t^{1/2}}{M C_1^{1/2}} +\right\} +\right) \\ +&\leq +C \exp\left( +-\frac{1}{C} +\min \left\{ +\frac{t^2}{n^3 \sigma^4}, +\frac{t}{\sqrt{n^3 \sigma^4}}, +\frac{t^{2/3}}{(n M \sigma)^{2/3}}, +\frac{t^{1/2}}{M} +\right\} +\right). +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_estimation}] + +Throughout this proof we will write +$k_{i j}$ for $k_h(W_{i j},w)$ and +$k_{i j}'$ for $k_h(W_{i j},w')$, +in the interest of brevity. +Similarly, we write $S_{i j r}$ to denote $S_{i j r}(w,w')$. +The estimand and estimator are reproduced below for clarity. +% +\begin{align*} +\Sigma_n(w,w') +&= +\frac{2}{n(n-1)} +\E[k_{i j} k_{i j}'] ++ \frac{4(n-2)}{n(n-1)} +\E[k_{i j} k_{i r}'] +- \frac{4n-6}{n(n-1)} +\E[k_{i j}] +\E[k_{i j}'] \\ +\hat \Sigma_n(w,w') +&= +\frac{2}{n(n-1)} +\frac{2}{n(n-1)} +\sum_{i 0$ and since $n h \gtrsim \log n$, +the class $\cF$ has a constant envelope function +given by $F(a) \lesssim \sqrt{n h}$. +Clearly, $M = \sup_a F(a) \lesssim \sqrt{n h}$. +Also by definition of $\Sigma_n$ +and orthogonality of $L_n$, $Q_n$, and $E_n$, +we have +$\sup_{f \in \cF} \E[f(A_i)^2] \leq \sigma^2 = 1$. +To verify a VC-type condition on $\cF$ +we need to establish the regularity of the process. 
+By Lipschitz properties +of $L_n$ and $\Sigma_n$ +derived in the proofs of Lemma~\ref{lem:kernel_uniform_concentration} +and Theorem~\ref{thm:kernel_infeasible_ucb} +respectively, +we have +% +\begin{align*} +\left| +\frac{L_n(w)} +{\sqrt{\Sigma_n(w,w)}} +- \frac{L_n(w')} +{\sqrt{\Sigma_n(w',w')}} +\right| +&\lesssim +\frac{\big|L_n(w) - L_n(w')\big|} +{\sqrt{\Sigma_n(w,w)}} ++ +\left| L_n(w') \right| +\left| +\frac{1} +{\sqrt{\Sigma_n(w,w)}} +- \frac{1} +{\sqrt{\Sigma_n(w',w')}} +\right| \\ +&\lesssim +\sqrt{n^2h} +|w-w'| ++\left| +\frac{\Sigma_n(w,w) - \Sigma_n(w',w')} +{\Sigma_n(w,w)\sqrt{\Sigma_n(w',w')}} +\right| \\ +&\lesssim +\sqrt{n^2h} +|w-w'| ++ (n^2h)^{3/2} +\left| +\Sigma_n(w,w) - \Sigma_n(w',w') +\right| \\ +&\lesssim +\sqrt{n^2h} +|w-w'| ++ (n^2h)^{3/2} +n^{-1} h^{-3} +|w-w'| +\lesssim +n^4 |w-w'|, +\end{align*} +% +uniformly over $w,w' \in \cW$. By compactness of $\cW$ we have the covering +number bound +% +$N(\cF, \|\cdot\|_\infty, \varepsilon) \lesssim +N(\cW, |\cdot|, n^{-4} \varepsilon) \lesssim n^4 \varepsilon^{-1}$. +% +Thus by Lemma~\ref{lem:kernel_app_maximal_vc_inid}, +% +\begin{align*} +\E \left[ +\sup_{w \in \cW} +\left| +\frac{L_n(w)} +{\sqrt{\Sigma_n(w,w)}} +\right| +\right] +&\lesssim +\sqrt{\log n} ++ \frac{\sqrt{n h} \log n}{\sqrt{n}} +\lesssim +\sqrt{\log n}. +\end{align*} +% +Therefore +% +\begin{align*} +\sup_{w,w' \in \cW} +\left| +\frac{ +\hat f_W(w) \hat f_W(w') +- \E\big[k_{i j}\big] \E\big[k_{i j'}\big]} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} +\right| +&\lesssim_\P +\sqrt{\log n}. +\end{align*} + +\proofparagraph{decomposition of the $S_{i j r}$ term} + +We first decompose the $S_{i j r}$ term into two parts, +and obtain a pointwise concentration result for each. +This is extended to a uniform concentration result +by considering the regularity of the covariance estimator process. +Note that +$\E[S_{i j r}] = \E[k_{i j} k_{i r}']$, +and hence +% +\begin{align*} +&\frac{6}{n(n-1)(n-2)} +\sum_{i 0$: +% +\begin{align*} +&\P\left( +\left| +\sum_{i t +\biggm\vert \bA_n +\right) \\ +&\quad\leq +C_1 \exp\left( +-\frac{1}{C_1} +\min \left\{ +\frac{t^2}{n^3 \sigma^4}, +\frac{t}{\sqrt{n^3 \sigma^4}}, +\frac{t^{2/3}}{(n M \sigma)^{2/3}}, +\frac{t^{1/2}}{M} +\right\} +\right) \\ +&\quad\leq +C_1 \exp\left( +-\frac{1}{C_1} +\min \left\{ +\frac{t^2 h^2}{n^3}, +\frac{t h}{\sqrt{n^3}}, +\frac{t^{2/3} h}{n^{2/3}}, +t^{1/2} h +\right\} +\right), +\end{align*} +% +and therefore +with $t \geq 1$ +and since +$n h \gtrsim \log n$, +introducing and adjusting a new +constant $C_2$ where necessary, +% +\begin{align*} +&\P\left( +\left| +\frac{6}{n(n-1)(n-2)} +\sum_{i t +\frac{\log n}{\sqrt{n^3 h^2}} +\Bigm\vert \bA_n +\right) \\ +&\quad\leq +\P\left( +\left| +\sum_{i t +n^{3/2} h^{-1} \log n / 24 +\Bigm\vert \bA_n +\right) \\ +&\quad\leq +C_2 \exp\left( +-\frac{1}{C_2} +\min \left\{ +(t \log n)^2, +t \log n, +(t \log n)^{2/3} (n h)^{1/3}, +(t n h \log n)^{1/2} n^{1/4} +\right\} +\right) \\ +&\quad\leq +C_2 \exp\left( +-\frac{1}{C_2} +\min \left\{ +t \log n, +t \log n, +t^{2/3} \log n, +t^{1/2} n^{1/4} \log n +\right\} +\right) \\ +&\quad= +C_2 \exp\left( +-\frac{t^{2/3} \log n}{C_2} +\right) += +C_2 +n^{-t^{2/3} / C_2}. +\end{align*} +% +Now for the term +in \eqref{eq:kernel_app_Sijr1_decomp2}, +note that +$\frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n]$ +is $\bA_n$-measurable and bounded uniformly in $i,j$. 
+Also, using the previously established conditional variance +and almost sure bounds on $k_{i j}$, +Bernstein's inequality +(Lemma~\ref{lem:kernel_app_bernstein}) +applied conditionally +gives for some constant $C_3 > 0$ +% +\begin{align*} +&\P\left( +\Bigg| +\frac{2}{(n-1)(n-2)} +\sum_{i=1}^{n-2} +\sum_{j=i+1}^{n-1} +\Big( +k_{i j} +- \E[k_{i j} \mid \bA_n] +\Big) +\cdot \frac{3}{n} +\sum_{r=j+1}^n +\E[k_{i r}' \mid \bA_n] +\Bigg| +> t +\sqrt{\frac{\log n}{n^2h}} +\Bigm\vert \bA_n +\right) \\ +&\qquad\leq +2 \exp \left( - +\frac{t^2 n^2 \log n / (n^2h)} +{C_3/(2h) + C_3 t \sqrt{\log n / (n^2h)} / (2h)} +\right) \\ +&\qquad= +2 \exp \left( - +\frac{t^2 \log n} +{C_3/2 + C_3 t \sqrt{\log n / (n^2h)} / 2} +\right) +\leq +2 \exp \left( - +\frac{t^2 \log n}{C_3} +\right) += +2 n^{-t^2 / C_3}. +\end{align*} +% +The term in \eqref{eq:kernel_app_Sijr1_decomp3} +is controlled in exactly the same way. +Putting these together, noting the symmetry in $i,j,r$ +and taking a marginal expectation, +we obtain the unconditional pointwise concentration inequality +% +\begin{align*} +\P\left( +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i t +\frac{\log n}{\sqrt{n^3h^2}} ++ t \sqrt{\frac{\log n}{n^2h}} +\right) +&\leq +C_2 n^{-t^{2/3} / C_2} ++ 4 n^{-t^2 / (4C_3)}. +\end{align*} +% +Multiplying by +$\big(\Sigma_n(w,w) + \Sigma_n(w',w')\big)^{-1/2} \lesssim \sqrt{n^2h}$ +gives (adjusting constants if necessary) +% +\begin{align*} +&\P\left( +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i t \frac{\log n}{\sqrt{n h}} ++ t \sqrt{\log n} +\right) \\ +&\quad\leq +C_2 n^{-t^{2/3} / C_2} ++ 4 n^{-t^2 / (4C_3)}. +\end{align*} + +\proofparagraph{pointwise concentration of the $S_{i j r}^{(2)}$ term} + +We apply the U-statistic concentration inequality from +Lemma~\ref{lem:kernel_app_ustat_concentration}. +Note that the terms +$\E[S_{i j r} \mid \bA_n]$ +are permutation-symmetric functions of +the random variables +$A_i, A_j$, and $A_r$ only, +making $S_{i j r}^{(2)}$ the summands of +a (non-degenerate) mean-zero third-order U-statistic. +While we could apply a third-order Hoeffding decomposition +here to achieve degeneracy, +it is unnecessary as Lemma~\ref{lem:kernel_app_ustat_concentration} +is general enough to deal with the non-degenerate case directly. +The quantity of interest here is +% +\begin{align*} +\frac{6}{n(n-1)(n-2)} +\sum_{i t \sqrt{\log n} \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')} +\right) \\ +&\quad\leq +4 \exp \left( +- \frac{n t^2 (\Sigma_n(w,w) + \Sigma_n(w',w')) \log n} +{C_4 (n\Sigma_n(w,w) + n\Sigma_n(w',w')) ++ C_4 t \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}\sqrt{\log n}} +\right) \\ +&\quad\leq +4 \exp \left( +- \frac{t^2 \log n} +{C_4 ++ C_4 t (\Sigma_n(w,w) + \Sigma_n(w',w'))^{-1/2} \sqrt{\log n} / n} +\right) \\ +&\quad\leq +4 \exp \left( +- \frac{t^2 \log n} +{C_4 ++ C_4 t \sqrt{h}} +\right) +\leq +4 n^{-t^2 / C_4} +\end{align*} +% +for some universal constant $C_4 > 0$ +(which may change from line to line), +since the order of this U-statistic is fixed at three. + +\proofparagraph{concentration of the $S_{i j r}$ term on a mesh} + +Pick $\delta_n \to 0$ +with $\log 1/\delta_n \lesssim \log n$. +Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ +with cardinality $O(1/\delta_n)$. +Then $\cW_\delta \times \cW_\delta$ +is a $2\delta_n$-covering of $\cW \times \cW$ +with cardinality $O(1/\delta_n^2)$, +under the Manhattan metric +$d\big((w_1, w_1'), (w_2, w_2')\big) += |w_1 - w_2| + |w_1' - w_2'|$. 
+By the previous parts, +we have that for fixed $w$ and $w'$: +% +\begin{align*} +&\P\Bigg( +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i t \frac{\log n}{\sqrt{n h}} ++ 2t \sqrt{\log n} +\Bigg) \\ +&\quad\leq +C_2 n^{-t^{2/3} / C_2} ++ 4 n^{-t^2 / (4C_3)} ++ 4 n^{-t^2 / C_4}. +\end{align*} +% +Taking a union bound over $\cW_\delta \times \cW_\delta$, +noting that $n h \gtrsim \log n$ +and adjusting constants gives +% +\begin{align*} +&\P\Bigg( +\sup_{w, w' \in \cW_\delta} +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i t \sqrt{\log n} +\Bigg) \\ +&\quad\lesssim +\delta_n^{-2} +\Big( +C_2 n^{-t^{2/3} / C_2} ++ 4 n^{-t^2 / (4C_3)} ++ 4 n^{-t^2 / C_4} +\Big) +\lesssim +\delta_n^{-2} +n^{-t^{2/3} / C_5}, +\end{align*} +% +for some constant $C_5 > 0$. + +\proofparagraph{regularity of the $S_{i j r}$ term} + +Next we bound the fluctuations in $S_{i j r}(w,w')$. +Writing $k_{i j}(w)$ for $k_h(W_{i j},w)$, +note that +% +\begin{align*} +\big| +k_{i j}(w_1) +k_{i r}(w_1') +- k_{i j}(w_2) +k_{i r}(w_2') +\big| +&\lesssim +\frac{1}{h} +\big| k_{i j}(w_1) - k_{i j}(w_2) \big| ++ +\frac{1}{h} +\big| k_{i r}(w_1') - k_{i r}(w_2') \big| \\ +&\lesssim +\frac{1}{h^3} +\Big( +|w_1 - w_2| ++ |w_1' - w_2'| +\Big), +\end{align*} +% +by the Lipschitz property of the kernel, +and similarly for the other summands in $S_{i j r}$. +Therefore, +% +\begin{align*} +\sup_{|w_1-w_2| \leq \delta_n} +\sup_{|w_1'-w_2'| \leq \delta_n} +\big| +S_{i j r}(w_1, w_1') +- S_{i j r}(w_2, w_2') +\big| +&\lesssim +\delta_n h^{-3}. +\end{align*} +% +Also as noted in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, +% +\begin{align*} +\sup_{|w_1-w_2| \leq \delta_n} +\sup_{|w_1'-w_2'| \leq \delta_n} +\big| +\Sigma_n(w_1,w_1') +- +\Sigma_n(w_2, w_2') +\big| +&\lesssim +\delta_n n^{-1}h^{-3}. +\end{align*} +% +Therefore, since $\sqrt{\Sigma_n(w,w)} \gtrsim \sqrt{n^2h}$ +and $|S_{i j r}| \lesssim h^{-2}$, +using +$\frac{a}{\sqrt b} - \frac{c}{\sqrt d} += \frac{a-c}{\sqrt b} + c \frac{d-b}{\sqrt{b d} \sqrt{b+d}}$, +% +\begin{align*} +&\sup_{|w_1-w_2| \leq \delta_n} +\sup_{|w_1'-w_2'| \leq \delta_n} +\left| +\frac{S_{i j r}(w_1, w_1')} +{\sqrt{\Sigma_n(w_1,w_1) + \Sigma_n(w_1',w_1')}} +- \frac{S_{i j r}(w_2, w_2')} +{\sqrt{\Sigma_n(w_2,w_2) + \Sigma_n(w_2',w_2')}} +\right| \\ +&\quad\lesssim +\delta_n h^{-3} \sqrt{n^2h} ++ h^{-2} \delta_n n^{-1} h^{-3} (n^2h)^{3/2} +\lesssim +\delta_n n h^{-5/2} ++ \delta_n n^{2} h^{-7/2} +\lesssim +\delta_n n^{6}, +\end{align*} +% +where in the last line we use that +$1/h \lesssim n$. + +\proofparagraph{uniform concentration of the $S_{i j r}$ term} + +By setting +$\delta_n = n^{-6} \sqrt{\log n}$, +the fluctuations can be at most $\sqrt{\log n}$, +so we have for $t \geq 1$ +% +\begin{align*} +&\P\Bigg( +\sup_{w, w' \in \cW} +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i 2t \sqrt{\log n} +\Bigg) \\ +&\quad\lesssim +\delta_n^{-2} +n^{-t^{2/3} / C_5} +\lesssim +n^{12-t^{2/3} / C_5}. +\end{align*} +% +This converges to zero for any sufficiently large $t$, so +% +\begin{align*} +\sup_{w, w' \in \cW} +\Bigg| +\frac{6}{n(n-1)(n-2)} +\sum_{i t +\sqrt{\frac{\log n}{n^2h^3}} +\Bigm\vert \bA_n +\right) \\ +&\quad\leq +2 \exp\left( +- \frac{t^2 n^2 \log n / (n^2h^3)} +{C_6 h^{-3} / 2 + C_6 t h^{-2} \sqrt{\log n / (n^2h^3)} / 2} +\right) \\ +&\quad\leq +2 \exp\left( +- \frac{t^2 \log n} +{C_6 / 2 + C_6 t \sqrt{\log n / (n^2h)} / 2} +\right) +\leq +2 \exp\left( - \frac{t^2 \log n}{C_6} \right) += 2 n^{-t^2 / C_6}, +\end{align*} +% +where $C_6$ is a universal positive constant. 
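
Before treating the second term, note for comparison
(a brief aside, using only the standing assumption $n h \gtrsim \log n$)
that the pointwise scale just obtained satisfies
%
\begin{align*}
\sqrt{\frac{\log n}{n^2h^3}}
&=
\frac{1}{\sqrt{n h}}
\sqrt{\frac{\log n}{n h^2}}
\leq
\sqrt{\frac{\log n}{n h^2}},
\end{align*}
%
so it is dominated by the scale obtained for the
$S_{i j}^{(2)}$ term below.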
+ +\proofparagraph{pointwise concentration of the $S_{i j}^{(2)}$ term} + +We apply the U-statistic concentration inequality from +Lemma~\ref{lem:kernel_app_ustat_concentration}. +Note that $S_{i j}^{(2)}$ +are permutation-symmetric functions of +the random variables +$A_i$ and $A_j$ only, +making them the summands of +a (non-degenerate) mean-zero second-order U-statistic. +Note that +$\big|S_{i j}^{(2)}\big| \lesssim h^{-1}$ +and so trivially +$\E\big[\E[S_{i j}^{(2)} \mid A_i ]^2 \big] \lesssim h^{-2}$. +Thus by Lemma~\ref{lem:kernel_app_ustat_concentration}, +since the order of this U-statistic is fixed at two, +for some universal positive constant $C_7$ we have +% +\begin{align*} +\P\left( +\Bigg| +\frac{2}{n(n-1)} +\sum_{i t +\sqrt{\frac{\log n}{n h^2}} +\right) +&\leq +2 \exp\left( +- \frac{t^2 n \log n / (n h^2)} +{C_7 h^{-2} / 2 + C_7 t h^{-1} \sqrt{\log n / (n h^2)} / 2} +\right) \\ +&\leq +2 \exp\left( +- \frac{t^2 \log n} +{C_7 / 2 + C_7 t \sqrt{\log n / n} / 2} +\right) \\ +&\leq +2 \exp\left( +- \frac{t^2 \log n}{C_7} +\right) += +2 n^{-t^2 / C_7}. +\end{align*} + +\proofparagraph{concentration of the $k_{i j}k_{i j}'$ term on a mesh} + +As before, use a union bound +on the mesh $\cW_\delta \times \cW_\delta$. +% +\begin{align*} +&\P\left( +\sup_{w,w' \in \cW_\delta} +\left| +\frac{2}{n(n-1)} +\sum_{i t \sqrt{\frac{\log n}{n^2h^3}} ++ t \sqrt{\frac{\log n}{n h^2}} +\right) \\ +&\ \leq +\P\!\left( +\!\sup_{w,w' \in \cW_\delta} +\Bigg| +\frac{2}{n(n-1)} +\sum_{i t +\sqrt{\frac{\log n}{n^2h^3}} +\right) +\! + \P\!\left( +\!\sup_{w,w' \in \cW_\delta} +\Bigg| +\frac{2}{n(n-1)} +\sum_{i t +\sqrt{\frac{\log n}{n h^2}} +\right) \\ +&\ \lesssim +\delta_n^{-2} n^{-t^2 / C_6} ++ \delta_n^{-2} n^{-t^2 / C_7}. +\end{align*} + +\proofparagraph{regularity of the $k_{i j}k_{i j}'$ term} + +As for the $S_{i j r}$ term, +% +$\big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| +\lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big)$. + +\proofparagraph{uniform concentration of the $k_{i j}k_{i j}'$ term} + +Setting $\delta_n = h^3\sqrt{\log n / (n h^2)}$, +the fluctuations are at most $\sqrt{\log n / (n h^2)}$, +so for $t \geq 1$ +% +\begin{align*} +&\P\left( +\sup_{w,w' \in \cW} +\left| +\frac{2}{n(n-1)} +\sum_{i t \sqrt{\frac{\log n}{n^2h^3}} ++ 2t \sqrt{\frac{\log n}{n h^2}} +\right) \\ +&\quad\leq +\P\left( +\sup_{w,w' \in \cW_\delta} +\left| +\frac{2}{n(n-1)} +\sum_{i t \sqrt{\frac{\log n}{n^2h^3}} ++ t \sqrt{\frac{\log n}{n h^2}} +\right) \\ +&\qquad+ +\P\left( +\sup_{|w_1-w_2| \leq \delta_n} +\sup_{|w_1'-w_2'| \leq \delta_n} +\big| +k_{i j}(w_1) +k_{i j}(w_1') +- k_{i j}(w_2) +k_{i j}(w_2') +\big| +> t \sqrt{\frac{\log n}{n h^2}} +\right) \\ +&\quad\lesssim +\delta_n^{-2} n^{-t^2 / C_6} ++ \delta_n^{-2} n^{-t^2 / C_7} +\lesssim +n^{1-t^2 / C_6} h^{-4} ++ n^{1-t^2 / C_7} h^{-4} +\lesssim +n^{5-t^2 / C_8}, +\end{align*} +% +where $C_8 > 0$ is a constant and +in the last line we use $1/h \lesssim n$. +This converges to zero for any sufficiently large $t$, +so by Lemma~\ref{lem:kernel_variance_bounds} we have +% +\begin{align*} +\sup_{w,w' \in \cW} +\left| +\frac{2}{n(n-1)} +\sum_{i 0$ +there exists a feasible function $M_\varepsilon$ with +$\objective(M_\varepsilon) \leq \objective^* + \varepsilon$, +and we call such a solution $\varepsilon$-optimal. +Let $\hat \Sigma_n^+$ be an $n^{-1}$-optimal solution. +Then +% +\begin{align*} +\objective(\hat \Sigma_n^+) +&\leq \objective^* + n^{-1} +\leq \objective(\Sigma_n) + n^{-1}. 
+\end{align*} +% +Thus by the triangle inequality, +% +\begin{align*} +\sup_{w,w' \in \cW} +\left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} +\right| +&\leq +\objective(\hat \Sigma_n^+) ++ \objective(\Sigma_n) +\leq 2 \, \objective(\Sigma_n) + n^{-1} +\lesssim_\P +\frac{\sqrt{\log n}}{n}. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_variance_estimator_bounds}] + +Since $\hat \Sigma_n^+$ is positive semi-definite, +we must have $\hat \Sigma_n^+(w,w) \geq 0$. +Now Lemma~\ref{lem:kernel_app_sdp} +implies that for all $\varepsilon \in (0,1)$ +there exists a $C_\varepsilon$ such that +% +\begin{align*} +&\P\left( +\Sigma_n(w,w) - C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)} +\leq +\hat \Sigma_n^+(w,w) +\right. +\\ +&\left. +\qquad\leq +\Sigma_n(w,w) + C_\varepsilon \frac{\sqrt{\log n}}{n} +\sqrt{\Sigma_n(w,w)}, +\quad \forall w \in \cW +\right) +\geq 1-\varepsilon. +\end{align*} +% +Consider the function +$g_a(t) = t - a \sqrt{t}$ +and note that it is increasing on $\{t \geq a^2/4\}$. +Applying this with $t = \Sigma_n(w,w)$ +and $a = \frac{\sqrt{\log n}}{n}$, +noting that by Lemma~\ref{lem:kernel_variance_bounds} we have +$t = \Sigma_n(w,w) \gtrsim \frac{1}{n^2h} +\gg \frac{\log n}{4n^2} = a^2/4$, +shows that for $n$ large enough, +% +\begin{align*} +\inf_{w \in \cW} \Sigma_n(w,w) +- \frac{\sqrt{\log n}}{n} \sqrt{\inf_{w \in \cW} \Sigma_n(w,w)} +\lesssim_\P +\inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ +\sup_{w \in \cW}\hat \Sigma_n^+(w,w) +\lesssim_\P +\sup_{w \in \cW} \Sigma_n(w,w) ++ \frac{\sqrt{\log n}}{n} \sqrt{\sup_{w \in \cW} \Sigma_n(w,w)}. +\end{align*} +% +Applying the bounds from Lemma~\ref{lem:kernel_variance_bounds} +yields +% +\begin{align*} +\frac{\Dl^2}{n} + \frac{1}{n^2h} +- \frac{\sqrt{\log n}}{n} +\left( \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) +\lesssim_\P +\inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ +\sup_{w \in \cW}\hat \Sigma_n^+(w,w) +\lesssim_\P +\frac{\Du^2}{n} + \frac{1}{n^2h} ++ \frac{\sqrt{\log n}}{n} +\left( \frac{\Du}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) +\end{align*} +% +and so +% +\begin{align*} +\frac{\Dl^2}{n} + \frac{1}{n^2h} +\lesssim_\P +\inf_{w \in \cW}\hat \Sigma_n^+(w,w) +\leq +\sup_{w \in \cW}\hat \Sigma_n^+(w,w) +\lesssim_\P +\frac{\Du^2}{n} + \frac{1}{n^2h}. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_sdp}] +See Lemma~\ref{lem:kernel_app_covariance_estimation} +and Lemma~\ref{lem:kernel_app_sdp}. +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_studentized_t_statistic}] +% +We have +% +\begin{align*} +&\sup_{w \in \cW} +\left| \hat T_n(w) - T_n(w) \right| += +\sup_{w \in \cW} +\bigg\{ +\left| +\hat f_W(w) - f_W(w) +\right| +\cdot +\bigg| +\frac{1} +{\hat\Sigma_n^+(w,w)^{1/2}} +- +\frac{1}{\Sigma_n(w,w)^{1/2}} +\bigg| +\bigg\} \\ +&\quad\leq +\sup_{w \in \cW} +\left| +\frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} +{\sqrt{\Sigma_n(w,w)}} ++ \frac{\E\big[\hat f_W(w)\big] - f_W(w)} +{\sqrt{\Sigma_n(w,w)}} +\right| +\cdot \sup_{w \in \cW} +\left| +\frac{\hat\Sigma_n^+(w,w) - \Sigma_n(w,w)} +{\sqrt{\Sigma_n(w,w) \hat\Sigma_n^+(w,w)}} +\right|. +\end{align*} +% +Now from the proof of Lemma~\ref{lem:kernel_app_covariance_estimation} we +have that +$\sup_{w \in \cW} \left| +\frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} +{\sqrt{\Sigma_n(w,w)}} \right| +\lesssim_\P \sqrt{\log n}$, +while Theorem~\ref{thm:kernel_bias} gives +$\sup_{w \in \cW} \big| \E\big[\hat f_W(w)\big] - f_W(w) \big| +\lesssim h^{p \wedge \beta}$. 
+By Lemma~\ref{lem:kernel_variance_bounds}, +note that +$\sup_{w \in \cW} \Sigma_n(w,w)^{-1/2} +\lesssim \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$, and +$\sup_{w \in \cW} \hat \Sigma_n^+(w,w)^{-1/2} +\lesssim_\P \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$ +by Lemma~\ref{lem:kernel_app_variance_estimator_bounds}. +Thus, applying Lemma~\ref{lem:kernel_app_sdp} to control the +covariance estimation error, +% +\begin{align*} +\sup_{w \in \cW} +\left| \hat T_n(w) - T_n(w) \right| +&\lesssim_\P +\left( +\sqrt{\log n} + \frac{h^{p \wedge \beta}}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} +\right) +\frac{\sqrt{\log n}}{n} +\frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \\ +&\lesssim_\P +\sqrt{\frac{\log n}{n}} +\left( +\sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} +{\Dl + 1/\sqrt{n h}} +\right) +\frac{1}{\Dl + 1/\sqrt{n h}}. +\end{align*} +\end{proof} + +\begin{proof}[% +Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}] + +Firstly, note that $\hat Z_n^T$ exists +by noting that $\hat \Sigma_n^+(w,w')$ and therefore also +$\frac{\hat \Sigma_n^+(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$ +are positive semi-definite +functions and appealing to the +Kolmogorov consistency theorem \citep{gine2021mathematical}. +To obtain the desired Kolmogorov--Smirnov result we discretize and +use the Gaussian--Gaussian comparison result found in +Lemma~3.1 in \citet{chernozhukov2013gaussian}. + +\proofparagraph{bounding the covariance discrepancy} + +Define the maximum discrepancy in the (conditional) covariances +of $\hat Z_n^T$ and $Z_n^T$ by +% +\begin{align*} +\Delta +&\vcentcolon= +\sup_{w, w' \in \cW} +\left| +\frac{\hat \Sigma_n^+(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +- \frac{\Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} +\right|. +\end{align*} +% +This variable can be bounded in probability +in the following manner. +First note that by the Cauchy--Schwarz inequality +for covariances, +$|\Sigma_n(w,w')| \leq +\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}$. +Hence +% +\begin{align*} +\Delta +&\leq +\sup_{w, w' \in \cW} +\left\{ +\left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\right| ++ \left| +\frac{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')} +- \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\right| +\right\} \\ +&\leq +\sup_{w, w' \in \cW} +\left\{ +\sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} +{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} +\right| +\right\} \\ +&\quad+ +\sup_{w, w' \in \cW} +\left| +\frac{\hat \Sigma_n^+(w,w)\hat \Sigma_n^+(w',w') +- \Sigma_n(w,w) \Sigma_n(w',w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') +\Sigma_n(w,w) \Sigma_n(w',w')}} +\right|. +\end{align*} +% +For the first term, note that +$\inf_{w \in \cW} \hat \Sigma_n^+(w,w) +\gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}$ +by Lemma~\ref{lem:kernel_app_variance_estimator_bounds} and also +$\sup_{w \in \cW} +\left|\frac{\hat \Sigma_n(w,w)}{\Sigma_n(w,w)} - 1\right| +\lesssim_\P \sqrt{h \log n}$ +by the proof of Lemma~\ref{lem:kernel_app_sdp}. 
+Thus by Lemma~\ref{lem:kernel_app_sdp}, +% +\begin{align*} +&\sup_{w, w' \in \cW} +\left\{ +\sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} +{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\left| +\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} +{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} +\right| +\right\} \\ +&\quad\lesssim_\P +\frac{\sqrt{\log n}}{n} +\frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} +\lesssim_\P +\sqrt{\frac{\log n}{n}} +\frac{1}{\Dl + 1/\sqrt{n h}}. +\end{align*} +% +For the second term, we have by the same bounds +% +\begin{align*} +&\sup_{w, w' \in \cW} +\left| +\frac{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') +- \Sigma_n(w,w) \Sigma_n(w',w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') +\Sigma_n(w,w) \Sigma_n(w',w')}} +\right| \\ +&\quad\leq +\sup_{w, w' \in \cW} +\left\{ +\frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big| +\hat \Sigma_n^+(w',w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') +\Sigma_n(w,w) \Sigma_n(w',w')}} +\right\} \\ +&\qquad+ +\sup_{w, w' \in \cW} +\left\{ +\frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big| +\Sigma_n(w,w)} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') +\Sigma_n(w,w) \Sigma_n(w',w')}} +\right\} \\ +&\quad\leq +\sup_{w, w' \in \cW} +\left\{ +\frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big|} +{\sqrt{\Sigma_n(w,w)}} +\frac{\sqrt{\hat \Sigma_n^+(w',w')}} +{\sqrt{\hat \Sigma_n^+(w,w) \Sigma_n(w',w')}} +\right\} \\ +&\qquad+ +\!\sup_{w, w' \in \cW}\! +\left\{ +\frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big|} +{\sqrt{\Sigma_n(w',w')}} +\frac{\sqrt{\Sigma_n(w,w)}} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} +\right\} +\lesssim_\P +\sqrt{\frac{\log n}{n}} +\frac{1}{\Dl + 1/\sqrt{n h}}. +\end{align*} +% +Therefore +$\Delta \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}$. + +\proofparagraph{Gaussian comparison on a mesh} + +Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ +with cardinality $O(1/\delta_n)$, +where $1/\delta_n$ is at most polynomial in $n$. +The scaled (conditionally) Gaussian +processes $Z_n^T$ and $\hat Z_n^T$ +both have pointwise (conditional) variances of 1. +Therefore, by Lemma~3.1 in \citet{chernozhukov2013gaussian}, +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW_\delta} +Z_n^T(w) +\leq t +\right) +- \P\left( +\sup_{w \in \cW_\delta} +\hat Z_n^T(w) +\leq t +\Bigm\vert \bW_n +\right) +\right| +&\lesssim +\Delta^{1/3} +\Big( +1 \vee \log \frac{1}{\Delta \delta_n} +\Big)^{2/3} +\end{align*} +% +uniformly in the data. By the previous part and +since $x (\log 1/x)^2$ is increasing on $\big(0, e^{-2}\big)$, +% +\begin{align*} +&\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW_\delta} +Z_n^T(w) +\leq t +\right) +- \P\left( +\sup_{w \in \cW_\delta} +\hat Z_n^T(w) +\leq t +\Bigm\vert \bW_n +\right) +\right| \\ +&\quad\lesssim_\P +\left( +\sqrt{\frac{\log n}{n}} +\frac{1}{\Dl + 1/\sqrt{n h}} +\right)^{1/3} +(\log n)^{2/3} +\lesssim_\P +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}}. +\end{align*} + +\proofparagraph{trajectory regularity of $Z_n^T$} + +In the proof of Theorem~\ref{thm:kernel_infeasible_ucb} +we established that $Z_n^T$ satisfies the regularity property +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\big| Z_n^T(w) - Z_n^T(w') \big| +\right] +&\lesssim +n h^{-1} +\sqrt{\delta_n \log n}, +\end{align*} +% +whenever $1/\delta_n$ +is at most polynomial in $n$. 
+ +\proofparagraph{conditional $L^2$ regularity of $\hat Z_n^T$} + +By Lemma~\ref{lem:kernel_app_sdp}, +with $n h \gtrsim \log n$, +we have +uniformly in $w,w'$, +% +\begin{align*} +\big| +\hat \Sigma_n^+(w,w') +- \hat \Sigma_n^+(w,w) +\big| +&\lesssim +n^{-1} h^{-3} |w-w'|. +\end{align*} +% +Taking +$\delta_n \leq n^{-2} h^2$, +Lemma~\ref{lem:kernel_app_variance_estimator_bounds} +gives +% +\begin{align*} +\inf_{|w-w'| \leq \delta_n} +\hat \Sigma_n^+(w,w') +\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h} +- n^{-1} h^{-3} \delta_n +\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h} +- \frac{1}{n^3h} +\gtrsim +\frac{\Dl^2}{n} ++ \frac{1}{n^2h}. +\end{align*} +% +The conditional $L^2$ +regularity of $\hat Z_n^T$ is +% +\begin{align*} +\E\left[ +\big( +\hat Z_n^T(w) - \hat Z_n^T(w') +\big)^2 +\bigm\vert \bW_n +\right] +&= +2 - 2 +\frac{\hat \Sigma_n^+(w,w')} +{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}. +\end{align*} +% +Applying the same elementary result as for $Z_n^T$ +in the proof of Theorem~\ref{thm:kernel_infeasible_ucb} yields +% +\begin{align*} +\E\left[ +\big( +\hat Z_n^T(w) - \hat Z_n^T(w') +\big)^2 +\bigm\vert \bW_n +\right] +&\lesssim_\P +n^2 h^{-2} |w-w'|. +\end{align*} +% +Thus the conditional semimetric +induced by $\hat Z_n^T$ on $\cW$ is +% +\begin{align*} +\hat\rho(w,w') +&\vcentcolon= +\E\left[ +\big( +\hat Z_n^T(w) - \hat Z_n^T(w') +\big)^2 +\bigm\vert \bW_n +\right]^{1/2} +\lesssim_\P +n h^{-1} \sqrt{|w-w'|}. +\end{align*} + +\proofparagraph{conditional trajectory regularity of $\hat Z_n^T$} + +As for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, +we apply Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +now conditionally, to obtain +% +\begin{align*} +\E\left[ +\sup_{|w-w'| \leq \delta_n} +\left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| +\Bigm\vert \bW_n +\right] +&\lesssim_\P +n h^{-1} +\sqrt{\delta_n \log n}, +\end{align*} +% +whenever $1/\delta_n$ +is at most polynomial in $n$. + +\proofparagraph{uniform Gaussian comparison} + +Now we use the trajectory regularity properties to +extend the Gaussian--Gaussian comparison result from a finite mesh +to all of $\cW$. +Write the previously established +approximation rate as +% +\begin{align*} +r_n +&= +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}}. 
+\end{align*} +% +Take $\varepsilon_n > 0$ and observe that +uniformly in $t \in \R$, +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\leq t +\Bigm\vert \bW_n +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW_\delta} +\big| \hat Z_n^T(w) \big| +\leq t + \varepsilon_n +\Bigm\vert \bW_n +\right) ++ \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW_\delta} +\big| Z_n^T(w) \big| +\leq t + \varepsilon_n +\right) ++ O_\P(r_n) ++ \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t + 2\varepsilon_n +\right) ++ O_\P(r_n) ++ \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +Z_n^T(w) +- Z_n^T(w') +\right| +\geq \varepsilon_n +\right) \\ +&\qquad+ +\P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t + 2\varepsilon_n +\right) ++ O_\P(r_n) ++ O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t +\right) ++ \P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq 2\varepsilon_n +\right) \\ +&\qquad+ +O_\P(r_n) ++ O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). +\end{align*} +% +The converse inequality is obtained +analogously as follows: +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\leq t +\Bigm\vert \bW_n +\right) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW_\delta} +\big| \hat Z_n^T(w) \big| +\leq t - \varepsilon_n +\Bigm\vert \bW_n +\right) +- \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW_\delta} +\big| Z_n^T(w) \big| +\leq t - \varepsilon_n +\right) +- O_\P(r_n) +- \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t - 2\varepsilon_n +\right) +- O_\P(r_n) +- \P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +Z_n^T(w) +- Z_n^T(w') +\right| +\geq \varepsilon_n +\right) \\ +&\qquad- +\P\left( +\sup_{|w-w'| \leq \delta_n} +\left| +\hat Z_n^T(w) +- \hat Z_n^T(w') +\right| +\geq \varepsilon_n +\Bigm\vert \bW_n +\right) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t - 2\varepsilon_n +\right) +- O_\P(r_n) +- O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t +\right) +- \P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq 2\varepsilon_n +\right) \\ +&\qquad- +O_\P(r_n) +- O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). +\end{align*} +% +Combining these uniform upper and lower bounds gives +% +\begin{align*} +&\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\leq t +\Bigm\vert \bW_n +\right) +- +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t +\right) +\right| \\ +&\qquad\lesssim_\P +\sup_{t \in \R} +\P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq 2\varepsilon_n +\right) ++ r_n ++ \varepsilon_n^{-1} n h^{-1/2} \delta_n^{1/2} \sqrt{\log n}. 
+\end{align*} +% +For the remaining term, apply anti-concentration +for $Z_n^T$ from the proof of Theorem~\ref{thm:kernel_infeasible_ucb}: +% +\begin{align*} +\sup_{t \in \R} +\P\left( +\left| +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +- t +\right| +\leq \varepsilon +\right) +&\lesssim +\varepsilon +\sqrt{\log n}. +\end{align*} +% +Therefore +% +\begin{align*} +&\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\leq t +\Bigm\vert \bW_n +\right) +- +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t +\right) +\right| \\ +&\qquad\lesssim_\P +\varepsilon_n \sqrt{\log n} ++ r_n ++ \varepsilon_n^{-1} n h^{-1/2} \delta_n^{1/2} \sqrt{\log n}. +\end{align*} +% +Taking $\varepsilon = r_n / \sqrt{\log n}$ +and then $\delta_n = n^{-2} h r_n^2 \varepsilon_n^2 / \log n$ +yields +% +\begin{align*} +\left| +\P\left( +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\leq t +\Bigm\vert \bW_n +\right) +- +\P\left( +\sup_{w \in \cW} +\big| Z_n^T(w) \big| +\leq t +\right) +\right| +&\lesssim_\P +r_n = +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}}. +\end{align*} +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}] + +\proofparagraph{Kolmogorov--Smirnov approximation} + +Let $Z_n^T$ and $\hat Z_n^T$ be defined +as in the proof of +Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. +Write +% +\begin{align*} +r_n +&= +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}} +\end{align*} +% +for the rate of approximation from +Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. +For any $\varepsilon_n > 0$ and uniformly in $t \in \R$: +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\left| +\hat Z_n^T(w) +\right| +\leq t +\Bigm\vert \bW_n +\right) +\leq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq t +\right) ++ +O_\P(r_n) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq t - \varepsilon_n +\right) ++ +\P\left( +\left| +\sup_{w \in \cW} +\big| +Z_n^T(w) +\big| +-t +\right| +\leq \varepsilon_n +\right) ++ +O_\P(r_n) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) ++ +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) - Z_n^T(w) \right| +\geq \varepsilon_n +\right) \\ +&\qquad+ +\P\left( +\left| +\sup_{w \in \cW} +\big| +Z_n^T(w) +\big| +-t +\right| +\leq \varepsilon_n +\right) ++ +O_\P(r_n) \\ +&\quad\leq +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) ++ +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) - Z_n^T(w) \right| +\geq \varepsilon_n +\right) ++ \varepsilon_n \sqrt{\log n} ++ O_\P(r_n), +\end{align*} +% +where in the last line we used the anti-concentration result +from Lemma~\ref{lem:kernel_app_anticoncentration} +applied to $Z_n^T$, +as in the proof of +Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. 
+The corresponding lower bound is as follows: +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\left| +\hat Z_n^T(w) +\right| +\leq t +\Bigm\vert \bW_n +\right) +\geq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq t +\right) +- +O_\P(r_n) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\left| +Z_n^T(w) +\right| +\leq t + \varepsilon_n +\right) +- +\P\left( +\left| +\sup_{w \in \cW} +\big| +Z_n^T(w) +\big| +-t +\right| +\leq \varepsilon_n +\right) +- +O_\P(r_n) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) +- +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) - Z_n^T(w) \right| +\geq \varepsilon_n +\right) \\ +&\qquad- +\P\left( +\left| +\sup_{w \in \cW} +\big| +Z_n^T(w) +\big| +-t +\right| +\leq \varepsilon_n +\right) +- +O_\P(r_n) \\ +&\quad\geq +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) +- +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) - Z_n^T(w) \right| +\geq \varepsilon_n +\right) +- \varepsilon_n \sqrt{\log n} +- O_\P(r_n). +\end{align*} + +\proofparagraph{$t$-statistic approximation} + +To control the remaining term, +note that by +Theorem~\ref{thm:kernel_strong_approx_Tn} +and Lemma~\ref{lem:kernel_app_studentized_t_statistic}, +% +\begin{align*} +&\sup_{w \in \cW} +\left| \hat T_n(w) - Z_n^T(w) \right| \\ +&\quad\leq +\sup_{w \in \cW} +\left| \hat T_n(w) - T_n(w) \right| ++ \sup_{w \in \cW} +\left| T_n(w) - Z_n^T(w) \right| \\ +&\quad\lesssim_\P +\sqrt{\frac{\log n}{n}} +\left( +\sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} +{\Dl + 1/\sqrt{n h}} +\right) +\frac{1}{\Dl + 1/\sqrt{n h}} \\ +&\qquad+ +\frac{ +n^{-1/2} \log n ++ n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n ++ n^{-2/3} h^{-1/2} (\log n)^{2/3} ++ n^{1/2} h^{p \wedge \beta}} +{\Dl + 1/\sqrt{n h}} +\end{align*} +% +and denote this last quantity by $r_n'$. +Then for any $\varepsilon_n \gg r_n'$, +we have +% +\begin{align*} +\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right) +- \P\left( +\sup_{w \in \cW} +\left| +\hat Z_n^T(w) +\right| +\leq t +\Bigm\vert \bW_n +\right) +\right| +&\lesssim_\P +\varepsilon_n \sqrt{\log n} ++ r_n ++ o(1). +\end{align*} + +\proofparagraph{rate analysis} + +This rate is $o_\P(1)$ +with an appropriate choice of $\varepsilon_n$ whenever +$r_n \to 0$ and $r_n' \sqrt{\log n} \to 0$, +by Lemma~\ref{lem:kernel_app_slow_convergence}, along with +a slowly diverging sequence $R_n$. Explicitly, we require the following. +% +\begin{align*} +\frac{n^{-1/2} (\log n)^{3/2}}{\Dl + 1/\sqrt{n h}} +&\to 0, +&\frac{h^{p \wedge \beta} \log n}{\Dl^2 + (n h)^{-1}} +&\to 0, \\ +\frac{n^{-1/2} (\log n)^{3/2}} +{\Dl + 1/\sqrt{n h}} +&\to 0, +&\frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} +{\Dl + 1/\sqrt{n h}} +&\to 0, \\ +\frac{n^{-2/3} h^{-1/2} (\log n)^{7/6}} +{\Dl + 1/\sqrt{n h}} +&\to 0, +&\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} +{\Dl + 1/\sqrt{n h}} +&\to 0, \\ +\frac{n^{-1/6}(\log n)^{5/6}} +{\Dl^{1/3} + (n h)^{-1/6}} +&\to 0. +\end{align*} +% +Using the fact that $h \lesssim n^{-\varepsilon}$ +for some $\varepsilon > 0$ +and removing trivial statements leaves us with +% +\begin{align*} +\frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} +{\Dl + 1/\sqrt{n h}} +&\to 0, +&\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} +{\Dl + 1/\sqrt{n h}} +&\to 0. +\end{align*} +% +We analyze these based on the degeneracy +and verify that they hold under Assumption~\ref{ass:kernel_rates}. 
+% +\begin{enumerate}[label=(\roman*)] + +\item No degeneracy: +if $\Dl > 0$ then we need +% +\begin{align*} +n^{-3/4} h^{-7/8} (\log n)^{7/8} +&\to 0, +&n^{1/2} h^{p \wedge \beta} (\log n)^{1/2} +&\to 0. +\end{align*} +% +These reduce to +$n^{-6/7} \log n \ll h +\ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$. + +\item Partial or total degeneracy: +if $\Dl = 0$ then we need +% +\begin{align*} +n^{-1/4} h^{-3/8} (\log n)^{7/8} +&\to 0, +&n h^{(p \wedge \beta) + 1/2} (\log n)^{1/2} +&\to 0. +\end{align*} +% +These reduce to +$n^{-2/3} (\log n)^{7/3} \ll h +\ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. +% +\end{enumerate} + +\end{proof} + +\begin{proof}[Theorem~\ref{thm:kernel_ucb}] + +\proofparagraph{existence of the conditional quantile} + +We argue as in the proof of +Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}, +now also conditioning on the data. +In particular, using the anti-concentration result from +Lemma~\ref{lem:kernel_app_anticoncentration}, +the regularity property of $\hat Z_n^T$, +and the Gaussian process maximal inequality from +Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, +we see that for any $\varepsilon > 0$, +% +\begin{align*} +\sup_{t \in \R} +\P\left( +\left| +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +- t +\right| +\leq 2\varepsilon +\Bigm\vert \bW_n +\right) +&\leq +8 \varepsilon +\left( +1 + \E\left[ +\sup_{w \in \cW} +\big| \hat Z_n^T(w) \big| +\Bigm\vert \bW_n +\right] +\right) +\lesssim \varepsilon \sqrt{\log n}. +\end{align*} +% +Thus letting $\varepsilon \to 0$ +shows that the conditional distribution function of +$\sup_{w \in \cW} \big|\hat Z_n^T(w)\big|$ +is continuous, +and therefore all of its conditional quantiles exist. + +\proofparagraph{validity of the confidence band} + +Define the following (conditional) distribution functions. +% +\begin{align*} +F_Z(t \mid \bW_n) +&= +\P\left( +\sup_{w \in \cW} +\left| \hat Z_n^T(w) \right| +\leq t +\Bigm\vert \bW_n +\right), +&F_T(t) +&= +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq t +\right), +\end{align*} +% +along with their well-defined right-quantile functions, +% +\begin{align*} +F_Z^{-1}(p \mid \bW_n) +&= +\sup +\big\{ +t \in \R +\, : \, +F_Z(t \mid \bW_n) += p +\big\}, +&F_T^{-1}(p) +&= +\sup +\big\{ +t \in \R +\, : \, +F_T(t) += p +\big\}. +\end{align*} +% +Note that +$t \leq F_Z^{-1}(p \mid \bW_n)$ +if and only if +$F_Z(t \mid \bW_n) \leq p$. +Take $\alpha \in (0,1)$ and +define the quantile +$\hat q_{1-\alpha} = F_Z^{-1}(1-\alpha \mid \bW_n)$, +so that +$F_Z(\hat q_{1-\alpha} \mid \bW_n) = 1-\alpha$. +By Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}, +% +\begin{align*} +\sup_{t \in \R} +\big| +F_Z(t \mid \bW_n) - F_T(t) +\big| +&= +o_\P(1). +\end{align*} +% +Thus by Lemma~\ref{lem:kernel_app_slow_convergence}, +this can be replaced by +% +\begin{align*} +\P\left( +\sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| +> \varepsilon_n +\right) +&\leq \varepsilon_n +\end{align*} +% +for some $\varepsilon_n \to 0$. 
+Therefore +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\left| +\hat T_n(w) +\right| +\leq +\hat q_{1-\alpha} +\right) +&= +\P\left( +\sup_{w \in \cW} +\left| +\hat T_n(w) +\right| +\leq +F_Z^{-1}(1-\alpha \mid \bW_n) +\right) \\ +&= +\P\left( +F_Z\left( +\sup_{w \in \cW} +\left| +\hat T_n(w) +\right| +\Bigm\vert \bW_n +\right) +\leq +1 - \alpha +\right) \\ +&\leq +\P\left( +F_T\left( +\sup_{w \in \cW} +\left| +\hat T_n(w) +\right| +\right) +\leq +1 - \alpha + \varepsilon_n +\right) ++ \varepsilon_n +\leq 1 - \alpha + 3\varepsilon_n, +\end{align*} +% +where we used the fact that for any +real-valued random variable $X$ with distribution function $F$, +we have +$\big|\P\big(F(X) \leq t\big) - t\big| \leq \Delta$, +where $\Delta$ is the size of the +largest jump discontinuity in $F$. +By uniform integrability, +$\sup_{t \in \R} \big| F_Z(t) - F_T(t) \big| = o(\varepsilon_n)$. +Since $F_Z$ has no jumps, +we must have $\Delta \leq \varepsilon_n$ for $F_T$. +Finally, a lower bound is constructed in an analogous manner, +giving +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\left| \hat T_n(w) \right| +\leq +\hat q_{1-\alpha} +\right) +&\geq +1 - \alpha - 3\varepsilon_n. +\end{align*} +% +\end{proof} + +\begin{proof}[Lemma~\ref{lem:kernel_app_counterfactual_bias}] + +Writing +$k_{i j} = k_h(W_{i j}^1, w)$, +$\psi_i = \psi(X_i^1)$, +$\hat\psi_i = \hat\psi(X_i^1)$, +and $\kappa_{i j} = \kappa(X_i^0, X_i^1, X_j^1)$, +% +\begin{align*} +\E\big[\hat f_W^{1 \triangleright 0}(w)\big] +&= +\E\left[ +\frac{2}{n(n-1)} +\sum_{i 0$ and +$p \in [1, \infty]$, with $\E^*$ the outer expectation, if +% +\begin{align*} +\E^* \left[ +\sup_{A \in \cB(\R^d)} +\Big\{ +\P \big( X \in A \mid \cH' \big) +- F \big( A_p^\eta \mid \cH' \big) +\Big\} +\right] +\leq \rho, +\end{align*} +% +where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$ +and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$, +then there exists an $\R^d$-valued random variable $Y$ +with $Y \mid \cH' \sim F(\cdot \mid \cH')$ +and $\P \left( \|X-Y\|_p > \eta \right) \leq \rho$. +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_strassen}] +By Theorem~B.2 in \citet{chen2020jackknife}, noting that the $\sigma$-algebra +generated by $Z$ is countably generated and using the metric induced by the +$\ell^p$-norm. +\end{proof} + +Next, we present in Lemma~\ref{lem:yurinskii_app_smooth_approximation} an +analytic result +concerning the smooth approximation of Borel set indicator functions, similar +to that given in \citet[Lemma~39]{belloni2019conditional}. + +\begin{lemma}[Smooth approximation of Borel indicator functions]% +\label{lem:yurinskii_app_smooth_approximation} +Let $A \subseteq \R^d$ be a Borel set and $Z \sim \cN(0, I_d)$. +For $\sigma, \eta > 0$ and $p \in [1, \infty]$, define +% +\begin{align*} +g_{A\eta}(x) +&= +\left( 1 - \frac{\|x-A^\eta\|_p}{\eta} \right) \vee 0 +& &\text{and} +&f_{A\eta\sigma}(x) +&= +\E\big[g_{A\eta}(x + \sigma Z) \big]. 
+\end{align*} +% +Then $f$ is infinitely differentiable +and with $\varepsilon = \P(\|Z\|_p > \eta / \sigma)$, +for all $k \geq 0$, +any multi-index $\kappa = (\kappa_1,\dots, \kappa_d)\in\N^d$, +and all $x,y \in \R^d$, +we have $|\partial^\kappa f_{A\eta\sigma}(x)| \leq +\frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}$ and +% +\begin{align*} +&\Bigg| +f_{A\eta\sigma}(x+y) - \sum_{|\kappa| = 0}^k +\frac{1}{\kappa!} +\partial^\kappa f_{A\eta\sigma}(x) +y^\kappa +\Bigg| +\leq +\frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}, \\ +&(1 - \varepsilon) \I\big\{x \in A\big\} +\leq f_{A\eta\sigma}(x) +\leq \varepsilon + (1 - \varepsilon) +\I\big\{x \in A^{3\eta}\big\}. +\end{align*} +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_smooth_approximation}] +Drop subscripts on $g_{A\eta}$ and $f_{A \eta \sigma}$. +By Taylor's theorem with Lagrange remainder, for $t \in [0,1]$, +% +\begin{align*} +\Bigg| +f(x + y) +- \sum_{|\kappa|=0}^{k} +\frac{1}{\kappa!} +\partial^{\kappa} f(x) +y^\kappa +\Bigg| +\leq +\Bigg| +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\big( +\partial^{\kappa} f(x + t y) +- \partial^{\kappa} f(x) +\big) +\Bigg|. +\end{align*} +% +Now with $\phi(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$, +% +\begin{align*} +f(x) +&= +\E\big[g(x + \sigma W) \big] += +\int_{\R^d} +g(x + \sigma u) +\prod_{j=1}^{d} +\phi(u_j) +\diff u += +\frac{1}{\sigma^d} +\int_{\R^d} +g(u) +\prod_{j=1}^{d} +\phi \left( \frac{u_j-x_j}{\sigma} \right) +\diff u +\end{align*} +% +and since the integrand is bounded, we exchange differentiation and +integration to compute +% +\begin{align} +\nonumber +\partial^\kappa +f(x) +&= +\frac{1}{\sigma^{d+|\kappa|}} +\int_{\R^d} +g(u) +\prod_{j=1}^{d} +\partial^{\kappa_j} +\phi \left( \frac{u_j-x_j}{\sigma} \right) +\diff u += \left( \frac{-1}{\sigma} \right)^{|\kappa|} +\int_{\R^d} +g(x + \sigma u) +\prod_{j=1}^{d} +\partial^{\kappa_j} +\phi(u_j) +\diff u \\ +\label{eq:yurinskii_app_smoothing_derivative} +&= +\left( \frac{-1}{\sigma} \right)^{|\kappa|} +\E \Bigg[ +g(x + \sigma Z) +\prod_{j=1}^{d} +\frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} +\Bigg], +\end{align} +% +where $Z \sim \cN(0, I_d)$. +Recalling that $|g(x)| \leq 1$ and applying the Cauchy--Schwarz inequality, +% +\begin{align*} +\left| +\partial^\kappa +f(x) +\right| +&\leq +\frac{1}{\sigma^{|\kappa|}} +\prod_{j=1}^{d} +\E \left[ +\left( +\frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} +\right)^2 +\right]^{1/2} +\leq +\frac{1}{\sigma^{|\kappa|}} +\prod_{j=1}^{d} +\sqrt{\kappa_j!} += +\frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}, +\end{align*} +% +as the expected square of the Hermite polynomial of degree +$\kappa_j$ against the standard Gaussian measure is $\kappa_j!$. By the +reverse triangle inequality, $|g(x + t y) - g(x)| \leq t \|y\|_p / \eta$, +so by \eqref{eq:yurinskii_app_smoothing_derivative}, +% +\begin{align*} +&\left| +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\big( +\partial^{\kappa} f(x + t y) +- \partial^{\kappa} f(x) +\big) +\right| \\ +&\quad= +\left| +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\frac{1}{\sigma^{|\kappa|}} +\E \Bigg[ +\big( +g(x + t y + \sigma Z) +- g(x + \sigma Z) +\big) +\prod_{j=1}^{d} +\frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} +\Bigg] +\right| \\ +&\quad\leq +\frac{t \|y\|_p}{\sigma^k \eta} +\, \E \left[ +\Bigg| +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\prod_{j=1}^{d} +\frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} +\Bigg| +\right]. 
+\end{align*} +% +Therefore, by the Cauchy--Schwarz inequality, +% +\begin{align*} +&\Bigg( +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\big( +\partial^{\kappa} f(x + t y) +- \partial^{\kappa} f(x) +\big) +\Bigg)^2 +\leq +\frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} +\, \E \left[ +\Bigg( +\sum_{|\kappa|=k} +\frac{y^\kappa}{\kappa!} +\prod_{j=1}^{d} +\frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} +\Bigg)^2 +\right] \\ +&\quad= +\frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} +\sum_{|\kappa|=k} +\sum_{|\kappa'|=k} +\frac{y^{\kappa + \kappa'}}{\kappa! \kappa'!} +\prod_{j=1}^{d} +\, \E \left[ +\frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} +\frac{\partial^{\kappa'_j} \phi(Z_j)}{\phi(Z_j)} +\right]. +\end{align*} +% +Orthogonality of Hermite polynomials gives zero if +$\kappa_j \neq \kappa'_j$. By the multinomial theorem, +% +\begin{align*} +\left| +f(x + y) +- \sum_{|\kappa|=0}^{k} +\frac{1}{\kappa!} +\partial^{\kappa} f(x) +y^\kappa +\right| +&\leq +\frac{\|y\|_p}{\sigma^k \eta} +\Bigg( +\sum_{|\kappa|=k} +\frac{y^{2 \kappa}}{\kappa!} +\Bigg)^{1/2} +\leq +\frac{\|y\|_p}{\sigma^k \eta \sqrt{k!}} +\Bigg( +\sum_{|\kappa|=k} +\frac{k!}{\kappa!} +y^{2 \kappa} +\Bigg)^{1/2} \\ +&\leq +\frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}. +\end{align*} +% +For the final result, since +$f(x) = \E \left[ g(x + \sigma Z) \right]$ and +$\I\big\{x \in A^\eta\big\}\leq g(x)\leq \I\big\{x \in A^{2\eta}\big\}$, +% +\begin{align*} +f(x) +&\leq +\P \left( x + \sigma Z \in A^{2 \eta} \right) \\ +&\leq +\P \left( \|Z\|_p > \frac{\eta}{\sigma} \right) ++ \I \left\{ x \in A^{3 \eta} \right\} +\P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) += \varepsilon ++ (1 - \varepsilon) \I \left\{ x \in A^{3 \eta} \right\}, \\ +f(x) +&\geq +\P \left( x + \sigma Z \in A^{\eta} \right) +\geq +\I \left\{ x \in A \right\} +\P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) += (1 - \varepsilon) \I \left\{ x \in A \right\}. +\end{align*} +% +\end{proof} + +We provide a useful Gaussian inequality in +Lemma~\ref{lem:yurinskii_app_gaussian_useful} +which helps bound the $\beta_{\infty,k}$ moment terms appearing in several +places throughout the analysis. + +\begin{lemma}[A useful Gaussian inequality]% +\label{lem:yurinskii_app_gaussian_useful} + +Let $X \sim \cN(0, \Sigma)$ +where $\sigma_j^2 = \Sigma_{j j} \leq \sigma^2$ for all $1 \leq j \leq d$. +Then +% +\begin{align*} +\E\left[ +\|X\|_2^2 +\|X\|_\infty +\right] +&\leq +4 \sigma \sqrt{\log 2d} +\,\sum_{j=1}^d \sigma_j^2 +&&\text{and} +&\E\left[ +\|X\|_2^3 +\|X\|_\infty +\right] +&\leq +8 \sigma \sqrt{\log 2d} +\,\bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^{3/2}. +\end{align*} +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_useful}] + +By Cauchy--Schwarz, with $k \in \{2,3\}$, we have +$\E\left[\|X\|_2^{k} \|X\|_\infty \right] +\leq \E\big[\|X\|_2^{2k} \big]^{1/2} \E\big[\|X\|_\infty^2 \big]^{1/2}$. +For the first term, by H{\"o}lder's inequality and the even +moments of the normal distribution, +% +\begin{align*} +\E\big[\|X\|_2^4 \big] +&= +\E\Bigg[ +\bigg( +\sum_{j=1}^d X_j^2 +\bigg)^2 +\Bigg] += +\sum_{j=1}^d \sum_{k=1}^d +\E\big[ +X_j^2 X_k^2 +\big] +\leq +\bigg( +\sum_{j=1}^d +\E\big[X_j^4 \big]^{\frac{1}{2}} +\bigg)^2 += +3 \bigg( +\sum_{j=1}^d +\sigma_j^2 +\bigg)^2, \\ +\E\big[\|X\|_2^6 \big] +&= +\sum_{j=1}^d \sum_{k=1}^d \sum_{l=1}^d +\E\big[ +X_j^2 X_k^2 X_l^2 +\big] +\leq +\bigg( +\sum_{j=1}^d +\E\big[X_j^6 \big]^{\frac{1}{3}} +\bigg)^3 += +15 \bigg( +\sum_{j=1}^d +\sigma_j^2 +\bigg)^3. 
+\end{align*} +% +For the second term, by Jensen's inequality and the $\chi^2$ moment +generating function, +% +\begin{align*} +\E\big[\|X\|_\infty^2 \big] +&= +\E\left[ +\max_{1 \leq j \leq d} +X_j^2 +\right] +\leq +4 \sigma^2 +\log +\sum_{j=1}^d +\E\Big[ +e^{X_j^2 / (4\sigma^2)} +\Big] +\leq +4 \sigma^2 +\log +\sum_{j=1}^d +\sqrt{2} +\leq +4 \sigma^2 +\log 2 d. +\end{align*} +% +\end{proof} + +We provide an $\ell^p$-norm tail probability bound for Gaussian variables in +Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, motivating the definition of the +term +$\phi_p(d)$. + +\begin{lemma}[Gaussian \texorpdfstring{$\ell^p$}{lp}-norm bound]% +\label{lem:yurinskii_app_gaussian_pnorm} +Let $X \sim \cN(0, \Sigma)$ where $\Sigma \in \R^{d \times d}$ +is a positive semi-definite matrix. Then we have that +$\E\left[ \|X\|_p \right] \leq +\phi_p(d) \max_{1 \leq j \leq d} \sqrt{\Sigma_{j j}}$ +with $\phi_p(d) = \sqrt{pd^{2/p} }$ for $p \in [1,\infty)$ +and $\phi_\infty(d) = \sqrt{2\log 2d}$. +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}] + +For $p \in [1, \infty)$, +as each $X_j$ is Gaussian, we have +$\big(\E\big[|X_j|^p\big]\big)^{1/p} +\leq \sqrt{p\, \E[X_j^2]} += \sqrt{p \Sigma_{j j}}$. +So +% +\begin{align*} +\E\big[\|X\|_p\big] +&\leq +\Bigg(\sum_{j=1}^d \E \big[ |X_j|^p \big] \Bigg)^{1/p} +\leq \Bigg(\sum_{j=1}^d p^{p/2} \Sigma_{j j}^{p/2} \Bigg)^{1/p} +\leq \sqrt{p d^{2/p}} +\max_{1\leq j\leq d} +\sqrt{\Sigma_{j j}} +\end{align*} +% +by Jensen's inequality. +For $p=\infty$, +with $\sigma^2 = \max_j \Sigma_{j j}$, +for $t>0$, +% +\begin{align*} +\E\big[\|X\|_\infty \big] +&\leq +t +\log +\sum_{j=1}^d +\E\Big[ +e^{|X_j| / t} +\Big] +\leq +t +\log +\sum_{j=1}^d +\E\Big[ +2 e^{X_j / t} +\Big] +\leq t \log \Big(2 d e^{\sigma^2/(2t^2)}\Big) +\leq t \log 2 d + \frac{\sigma^2}{2t}, +\end{align*} +% +again by Jensen's inequality. +Setting $t = \frac{\sigma}{\sqrt{2 \log 2d}}$ gives +$\E\big[\|X\|_\infty \big] \leq \sigma \sqrt{2 \log 2d}$. +% +\end{proof} + +We give a Gaussian--Gaussian $\ell^p$-norm approximation +as Lemma~\ref{lem:yurinskii_app_feasible_gaussian}, useful for +ensuring approximations remain valid upon substituting +an estimator for the true variance matrix. + +\begin{lemma}[Gaussian--Gaussian approximation in +\texorpdfstring{$\ell^p$}{lp}-norm]% +\label{lem:yurinskii_app_feasible_gaussian} + +Let $\Sigma_1, \Sigma_2 \in \R^{d \times d}$ be positive semi-definite +and take $Z \sim \cN(0, I_d)$. +For $p \in [1, \infty]$ we have +% +\begin{align*} +\P\left( +\left\| +\left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z +\right\|_p +> t +\right) +&\leq +2 d \exp \left( +\frac{-t^2} +{2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} +\right). +\end{align*} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_feasible_gaussian}] + +Let $\Sigma \in \R^{d \times d}$ be positive semi-definite +and write $\sigma^2_j = \Sigma_{j j} $. 
+For $p \in [1, \infty)$ by a union bound and +Gaussian tail probabilities, +% +\begin{align*} +\P\left(\big\| \Sigma^{1/2} Z \big\|_p > t \right) +&= +\P\Bigg( +\sum_{j=1}^d +\left| +\left( +\Sigma^{1/2} Z +\right)_j +\right|^p +> t^p \Bigg) +\leq +\sum_{j=1}^d +\P\Bigg( +\left| +\left( +\Sigma^{1/2} Z +\right)_j +\right|^p +> \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} +\Bigg) \\ +&= +\sum_{j=1}^d +\P\Bigg( +\left| +\sigma_j Z_j +\right|^p +> \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} +\Bigg) += +\sum_{j=1}^d +\P\left( +\left| Z_j \right| +> \frac{t}{\|\sigma\|_p} +\right) +\leq +2 d \, \exp\left( \frac{-t^2}{2 \|\sigma\|_p^2} \right). +\end{align*} +% +The same result holds for $p = \infty$ since +% +\begin{align*} +\P\left(\big\| \Sigma^{1/2} Z \big\|_\infty > t \right) +&= +\P\left( +\max_{1 \leq j \leq d} +\left| +\left( +\Sigma^{1/2} Z +\right)_j +\right| +> t \right) +\leq +\sum_{j=1}^d +\P\left( +\left| +\left( +\Sigma^{1/2} Z +\right)_j +\right| +> t +\right) \\ +&= +\sum_{j=1}^d +\P\left( +\left| +\sigma_j Z_j +\right| +> t +\right) +\leq +2 \sum_{j=1}^d +\exp\left( \frac{-t^2}{2 \sigma_j^2} \right) +\leq +2 d +\exp\left( \frac{-t^2}{2 \|\sigma\|_\infty^2} \right). +\end{align*} +% +Now we apply this to the matrix +$\Sigma = \big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. +For $p \in [1, \infty)$, +% +\begin{align*} +\|\sigma\|_p^p +&= +\sum_{j=1}^d (\Sigma_{j j})^{p/2} += +\sum_{j=1}^d +\Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} +\leq +d \max_{1 \leq j \leq d} +\Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} \\ +&\leq +d \, \Big\|\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big\|_2^{p/2} += +d \, \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^p +\end{align*} +% +Similarly, for $p = \infty$ we have +% +\begin{align*} +\|\sigma\|_\infty +&= +\max_{1 \leq j \leq d} +(\Sigma_{j j})^{1/2} += +\max_{1 \leq j \leq d} +\Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{1/2} +\leq +\big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2. +\end{align*} +% +Thus for all $p \in [1, \infty]$ we have +$\|\sigma\|_p \leq +d^{1/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2$, +with $d^{1/\infty} = 1$. Hence +% +\begin{align*} +\P\left( +\left\| +\left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z +\right\|_p +> t +\right) +&\leq +2 d \exp \left( \frac{-t^2}{2 \|\sigma\|_p^2} \right) +\leq +2 d \exp \left( +\frac{-t^2} +{2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} +\right). +\end{align*} +% +\end{proof} + +We give a variance bound and an exponential inequality for $\alpha$-mixing +variables. + +\begin{lemma}[Variance bounds for +\texorpdfstring{$\alpha$}{alpha}-mixing random variables] +\label{lem:yurinskii_app_variance_mixing} + +Let $X_1, \ldots, X_n$ be +real-valued $\alpha$-mixing random +variables with mixing coefficients $\alpha(j)$. +Then +% +\begin{enumerate}[label=(\roman*)] + +\item +\label{it:yurinskii_app_variance_mixing_bounded} +If for constants $M_i$ we have +$|X_i| \leq M_i$ a.s.\ then +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +&\leq +4 \sum_{j=1}^\infty \alpha(j) +\sum_{i=1}^n M_i^2. +\end{align*} + +\item +\label{it:yurinskii_app_variance_mixing_exponential} +If $\alpha(j) \leq e^{-2j / C_\alpha}$ then +for any $r>2$ there is a constant +$C_r$ depending only on $r$ with +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +&\leq +C_r C_\alpha +\sum_{i=1}^n +\E\big[|X_i|^r\big]^{2/r}. 
+\end{align*} +\end{enumerate} +% +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_variance_mixing}] + +Define +$\alpha^{-1}(t) = +\inf\{j \in \N : \alpha(j) \leq t\}$ +and $Q_i(t) = \inf\{s \in \R : \P(|X_i| > s) \leq t\}$. +By Corollary~1.1 in \citet{rio2017asymptotic} +and H{\"o}lder's inequality for $r > 2$, +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +&\leq +4 \sum_{i=1}^n +\int_0^1 \alpha^{-1}(t) +Q_i(t)^2 \diff{t} \\ +&\leq +4 \sum_{i=1}^n +\left( +\int_0^1 \alpha^{-1}(t)^{\frac{r}{r-2}} \diff{t} +\right)^{\frac{r-2}{r}} +\left( +\int_0^1 |Q_i(t)|^r \diff{t} +\right)^{\frac{2}{r}} +\diff{t}. +\end{align*} +% +Now note that if $U \sim \Unif[0,1]$ then +$Q_i(U)$ has the same distribution as $X_i$. +Therefore +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +&\leq +4 +\left( +\int_0^1 \alpha^{-1}(t)^{\frac r{r-2}} \diff{t} +\right)^{\frac{r-2}r} +\sum_{i=1}^n +\E[|X_i|^r]^{\frac 2 r}. +\end{align*} +% +If $\alpha(j) \leq e^{-2j/C_\alpha}$ then +$\alpha^{-1}(t) \leq \frac{-C_\alpha \log t}{2}$ +so, for some constant +$C_r$ depending only on $r$, +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +\leq +2 C_\alpha +\left( +\int_0^1 (-\log t)^{\frac r{r-2}} \diff{t} +\right)^{\frac{r-2} r} +\sum_{i=1}^n +\E[|X_i|^r]^{\frac 2 r} +\leq +C_r C_\alpha +\sum_{i=1}^n +\E[|X_i|^r]^{\frac 2 r}. +\end{align*} +% +Alternatively, if for constants $M_i$ we have +$|X_i| \leq M_i$ a.s.\ then +% +\begin{align*} +\Var\left[ +\sum_{i=1}^n X_i +\right] +&\leq +4 \int_0^1 \alpha^{-1}(t) +\diff{t} +\sum_{i=1}^n M_i^2 +\leq +4 \sum_{j=1}^\infty \alpha(j) +\sum_{i=1}^n M_i^2. +\end{align*} +% +\end{proof} + +\begin{lemma}[Exponential concentration inequalities for +\texorpdfstring{$\alpha$}{alpha}-mixing random variables] +\label{lem:yurinskii_app_exponential_mixing} + +Let $X_1, \ldots, X_n$ be zero-mean real-valued +variables with $\alpha$-mixing coefficients +$\alpha(j) \leq e^{-2 j / C_\alpha}$. + +\begin{enumerate}[label=(\roman*)] + +\item +\label{it:yurinskii_app_exponential_mixing_bounded} +Suppose $|X_i| \leq M$ a.s.\ for $1 \leq i \leq n$. +Then for all $t > 0$ there is a constant $C_1$ with +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n +X_i +\right| +> C_1 M \big( \sqrt{n t} ++ (\log n)(\log \log n) t \big) +\right) +&\leq +C_1 e^{-t}. +\end{align*} +% +\item +\label{it:yurinskii_app_exponential_mixing_bernstein} +If further $\sum_{j=1}^n |\Cov[X_i, X_j]| \leq \sigma^2$, +then for all $t > 0$ there is a constant $C_2$ with +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n +X_i +\right| +\geq C_2 \big( (\sigma \sqrt n + M) \sqrt t ++ M (\log n)^2 t \big) +\right) +&\leq +C_2 e^{-t}. +\end{align*} + +\end{enumerate} + +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_exponential_mixing}] + +\begin{enumerate}[label=(\roman*)] + +\item +By Theorem~1 in \citet{merlevede2009bernstein}, +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n +X_i +\right| +> t +\right) +&\leq +\exp\left( +-\frac{C_1 t^2}{n M^2 + Mt (\log n)(\log\log n)} +\right). +\end{align*} +% +Replace $t$ by +$M \sqrt{n t} + M (\log n)(\log \log n) t$. + +\item +By Theorem~2 in \citet{merlevede2009bernstein}, +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n +X_i +\right| +> t +\right) +&\leq +\exp\left( +-\frac{C_2 t^2}{n\sigma^2 + M^2 + Mt (\log n)^2} +\right). +\end{align*} +% +Replace $t$ by +$\sigma \sqrt n \sqrt t + M \sqrt t + M (\log n)^2 t$. 
+\end{enumerate} +% +\end{proof} + +\subsection{Main results} + +To establish Theorem~\ref{thm:yurinskii_sa_dependent}, we first +give the analogous result +for martingales as Lemma~\ref{lem:yurinskii_app_sa_martingale}. Our approach is +similar to +that used in modern versions of Yurinskii's coupling for independent data, as +in Theorem~1 in \citet{lecam1988} and Theorem~10 in Chapter~10 of +\citet{pollard2002user}. The proof of +Lemma~\ref{lem:yurinskii_app_sa_martingale} relies on +constructing a ``modified'' martingale, which is close to the original +martingale, but which has an $\cH_0$-measurable terminal quadratic variation. + +\begin{lemma}[Strong approximation for vector-valued martingales]% +\label{lem:yurinskii_app_sa_martingale} + +Let $X_1, \ldots, X_n$ be $\R^d$-valued +square-integrable random vectors +adapted to a countably generated +filtration $\cH_0, \ldots, \cH_n$. +Suppose that +$\E[X_i \mid \cH_{i-1}] = 0$ for all $1 \leq i \leq n$ +and define $S = \sum_{i=1}^n X_i$. +Let $V_i = \Var[X_i \mid \cH_{i-1}]$ and +$\Omega = \sum_{i=1}^n V_i - \Sigma$ +where $\Sigma$ is a positive semi-definite +$\cH_0$-measurable $d \times d$ random matrix. +For each $\eta > 0$ and $p \in [1,\infty]$ +there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with +% +\begin{align*} +\P\big(\|S-T\|_p > 5\eta\big) +&\leq +\inf_{t>0} +\left\{ +2 \P\big( \|Z\|_p > t \big) ++ \min\left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\} \\ +\nonumber +&\quad+ +\inf_{M \succeq 0} +\big\{ 2\gamma(M) + \delta_p(M,\eta) ++ \varepsilon_p(M, \eta)\big\}, +\end{align*} +% +where the second infimum is over all positive semi-definite +$d \times d$ non-random matrices, and +% +\begin{align*} +\beta_{p,k} +&= +\sum_{i=1}^n \E\left[\| X_i \|^k_2 \| X_i \|_p ++ \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], +\qquad\gamma(M) += \P\big(\Omega \npreceq M\big), \\ +\delta_p(M,\eta) +&= +\P\left( +\big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p +\geq \eta +\right), +\qquad\pi_3 += +\sum_{i=1}^{n+m} +\sum_{|\kappa| = 3} +\E \Big[ \big| +\E \left[ X_i^\kappa \mid \cH_{i-1} \right] +\big| \Big], \\ +\varepsilon_p(M, \eta) +&= +\P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ +\Omega \preceq M\right), +\end{align*} +% +for $k \in \{2,3\}$, with $Z, Z_1,\dots ,Z_n$ i.i.d.\ standard Gaussian +on $\R^d$ independent of $\cH_n$. +\end{lemma} + +\begin{proof}[Lemma~\ref{lem:yurinskii_app_sa_martingale}] + +\proofparagraph{constructing a modified martingale} + +Take $M \succeq 0$ a fixed positive semi-definite +$d \times d$ matrix. +We start by constructing a new martingale based on $S$ +whose quadratic variation is $\Sigma + M$. +Take $m \geq 1$ and define +% +\begin{align*} +H_k +&= +\Sigma ++ M +- \sum_{i=1}^{k} V_i, +\qquad\qquad\qquad\qquad\tau += +\sup \big\{ k\in\{0,1,\dots,n\} : H_k \succeq 0 \big\}, \\ +\tilde X_i +&= +X_i\I\{i \leq \tau\} ++ \frac{1}{\sqrt{m}} H_\tau^{1/2} Z_i\I\{n+1 \leq i \leq n+m\}, +\qquad\qquad\tilde S += +\sum_{i=1}^{n+m} \tilde X_i, +\end{align*} +% +where $Z_{n+1}, \ldots, Z_{n+m}$ is an i.i.d.\ +sequence of standard Gaussian vectors in $\R^d$ +independent of $\cH_n$, +noting that $H_0 = \Sigma + M \succeq 0$ a.s. +Define the filtration +$\tilde \cH_0, \ldots, \tilde \cH_{n+m}$, +where $\tilde \cH_i = \cH_i$ for $0 \leq i \leq n$ +and is the $\sigma$-algebra generated by +$\cH_n$ and $Z_{n+1}, \dots, Z_{i}$ for $n+1 \leq i\leq n+m$. 
+Observe that $\tau$ is a stopping time with respect to $\tilde\cH_i$
+because $H_{i+1} - H_i = -V_{i+1} \preceq 0$ almost surely,
+so $\{\tau \leq i\} = \{H_{i+1} \nsucceq 0\}$ for $0\leq i < n$.
+In particular, $\tilde X_1, \ldots, \tilde X_{n+m}$ is a
+martingale difference sequence with respect to
+$\tilde \cH_0, \ldots, \tilde \cH_{n+m}$,
+with conditional variances
+$\tilde V_i = \Var\big[\tilde X_i \mid \tilde \cH_{i-1}\big]$
+satisfying $\tilde V_i = V_i \I\{i \leq \tau\}$
+for $1 \leq i \leq n$
+and $\tilde V_i = H_\tau / m$ for $n+1 \leq i \leq n+m$,
+so that
+$\sum_{i=1}^{n+m} \tilde V_i
+= \sum_{i=1}^{\tau} V_i + H_\tau
+= \Sigma + M$.
+On the event $\{\Omega \preceq M\}$ we have
+$H_n = M - \Omega \succeq 0$ and hence $\tau = n$,
+so that
+$S - \tilde S
+= -\frac{1}{\sqrt{m}} \sum_{i=n+1}^{n+m} H_n^{1/2} Z_i$,
+which conditionally on $\cH_n$ has the same distribution as
+$H_n^{1/2} Z$ with $Z \sim \cN(0, I_d)$ independent of $\cH_n$.
+Therefore
+$\P\big( \| S - \tilde S \|_p > \eta \big)
+\leq \P\big( \| H_n^{1/2} Z \|_p > \eta,\, \Omega \preceq M \big)
++ \P\big( \Omega \npreceq M \big)$,
+so
+%
+\begin{align*}%
+\label{eq:yurinskii_app_approx_modified_original}
+\P\big( \| S - \tilde S \|_p > \eta\big)
+&\leq
+2 \P\big(\Omega \npreceq M \big)
++ \P\big( \| (M-\Omega)^{1/2}Z \|_p > \eta,\,
+\Omega \preceq M \big)
+= 2 \gamma(M) + \varepsilon_p(M, \eta).
+\end{align*}
+
+\proofparagraph{strong approximation of the modified martingale}
+
+Let $\tilde Z_1, \ldots, \tilde Z_{n+m}$ be i.i.d.\ $\cN(0, I_d)$
+and independent of $\tilde \cH_{n+m}$.
+Define $\check X_i = \tilde V_i^{1/2} \tilde Z_i$
+and $\check S = \sum_{i=1}^{n+m} \check X_i$.
+Fix a Borel set $A \subseteq \R^d$ and $\sigma, \eta > 0$ and
+let $f = f_{A\eta\sigma}$ be the function defined in
+Lemma~\ref{lem:yurinskii_app_smooth_approximation}.
+By the Lindeberg method, write the telescoping sum
+%
+\begin{align*}
+\E\Big[f\big(\tilde S\big) - f\big(\check S\big)
+\mid \cH_0 \Big]
+&=
+\sum_{i=1}^{n+m}
+\E\Big[ f\big(Y_i + \tilde X_i\big)
+- f\big(Y_i + \check X_i\big)
+\mid \cH_0 \Big]
+\end{align*}
+%
+where
+$Y_i = \sum_{j=1}^{i-1} \tilde X_j + \sum_{j=i+1}^{n+m} \check X_j$.
+By Lemma~\ref{lem:yurinskii_app_smooth_approximation} we have for $k \geq 0$
+%
+\begin{align*}
+&\Bigg|
+\E\big[
+f(Y_i + \tilde X_i)
+- f(Y_i + \check X_i)
+\mid \cH_0
+\big]
+- \sum_{|\kappa| = 0}^k
+\frac{1}{\kappa!}
+\E \left[
+\partial^\kappa f(Y_i)
+\left( \tilde X_i^\kappa - \check X_i^\kappa \right)
+\bigm| \cH_0
+\right]
+\Bigg| \\
+&\quad\leq
+\frac{1}{\sigma^k \eta \sqrt{k!}}
+\E \left[
+\|\tilde X_i\|_p \|\tilde X_i\|_2^k
++ \|\check X_i\|_p \|\check X_i\|_2^k
+\bigm| \cH_0
+\right].
+\end{align*}
+%
+With $k \in \{2, 3\}$, we bound each summand.
+With $|\kappa| = 0$ we have
+$\tilde X_i^\kappa = \check X_i^\kappa$,
+so consider $|\kappa| = 1$.
+Noting that $\sum_{i=1}^{n+m} \tilde V_i = \Sigma + M$, define
+%
+\begin{align*}
+\tilde Y_i
+&=
+\sum_{j=1}^{i-1} \tilde X_j
++ \tilde Z_i
+\Bigg(\sum_{j=i+1}^{n+m} \tilde V_j\Bigg)^{1/2}
+=
+\sum_{j=1}^{i-1} \tilde X_j
++ \tilde Z_i
+\Bigg(\Sigma + M - \sum_{j=1}^{i} \tilde V_j\Bigg)^{1/2}
+\end{align*}
+%
+and let $\check \cH_i$ be the $\sigma$-algebra generated by
+$\tilde \cH_{i-1}$ and $\tilde Z_i$.
+Note that $\tilde Y_i$ is $\check \cH_i$-measurable
+and that $Y_i$ and $\tilde Y_i$
+have the same distribution conditional on $\tilde \cH_{n+m}$. So
+%
+\begin{align*}
+&\sum_{|\kappa| = 1}
+\frac{1}{\kappa!}
+\E\left[
+\partial^\kappa f(Y_i)
+\big( \tilde X_i^\kappa - \check X_i^\kappa \big)
+\bigm| \cH_0
+\right]
+= \E \left[
+\nabla f(Y_i)^\T
+\big( \tilde X_i - \tilde V_i^{1/2} \tilde Z_i \big)
+\bigm| \cH_0
+\right] \\
+&\quad=
+\E \left[
+\nabla f(\tilde Y_i)^\T \tilde X_i
+\bigm| \cH_0
+\right]
+- \E \left[
+\nabla f(Y_i)^\T \tilde V_i^{1/2} \tilde Z_i
+\bigm| \cH_0
+\right] \\
+&\quad=
+\E \left[
+\nabla f(\tilde Y_i)^\T
+\E \left[
+\tilde X_i
+\mid \check \cH_i
+\right]
+\bigm| \cH_0
+\right]
+- \E \left[
+\tilde Z_i
+\right]
+\E \left[
+\nabla f(Y_i)^\T \tilde V_i^{1/2}
+\bigm| \cH_0
+\right] \\
+&\quad=
+\E \left[
+\nabla f(\tilde Y_i)^\T
+\E \left[
+\tilde X_i
+\mid \tilde \cH_{i-1}
+\right]
+\bigm| \cH_0
+\right]
+- 0
+= 0.
+\end{align*} +% +Next, if $|\kappa| = 2$ then +% +\begin{align*} +&\sum_{|\kappa| = 2} +\frac{1}{\kappa!} +\E \left[ +\partial^\kappa f(Y_i) +\left( \tilde X_i^\kappa - \check X_i^\kappa \right) +\bigm| \cH_0 +\right] \\ +&\quad= +\frac{1}{2} +\E \left[ +\tilde X_i^\T \nabla^2 f(Y_i) \tilde X_i +- \tilde Z_i^\T \tilde V_i^{1/2} \nabla^2 f(Y_i) +\tilde V_i^{1/2} \tilde Z_i +\bigm| \cH_0 +\right] \\ +&\quad= +\frac{1}{2} +\E \left[ +\E \left[ +\Tr \nabla^2 f(\tilde Y_i) \tilde X_i \tilde X_i^\T +\bigm| \check \cH_i +\right] +\bigm| \cH_0 +\right] +- \frac{1}{2} +\E \left[ +\Tr \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} +\bigm| \cH_0 +\right] +\E \left[ +\tilde Z_i \tilde Z_i^\T +\right] \\ +&\quad= +\frac{1}{2} +\E \left[ +\Tr \nabla^2 f(Y_i) +\E \left[ +\tilde X_i \tilde X_i^\T +\bigm| \tilde \cH_{i-1} +\right] +\bigm| \cH_0 +\right] +- \frac{1}{2} +\E \left[ +\Tr \nabla^2 f(Y_i) \tilde V_i +\bigm| \cH_0 +\right] += 0. +\end{align*} +% +Finally, if $|\kappa| = 3$, then since +$\check X_i \sim \cN(0, \tilde V_i)$ +conditional on $\tilde \cH_{n+m}$, we have by symmetry of the Gaussian +distribution and Lemma~\ref{lem:yurinskii_app_smooth_approximation}, +% +\begin{align*} +& +\left| +\sum_{|\kappa| = 3} +\frac{1}{\kappa!} +\E \left[ +\partial^\kappa f(Y_i) +\left( \tilde X_i^\kappa - \check X_i^\kappa \right) +\bigm| \cH_0 +\right] +\right| +\\ +&\quad= +\left| +\sum_{|\kappa| = 3} +\frac{1}{\kappa!} +\left( +\E \left[ +\partial^\kappa f(\tilde Y_i) +\E \left[ \tilde X_i^\kappa \mid \check \cH_i \right] +\bigm| \cH_0 +\right] +- \E \left[ +\partial^\kappa f(Y_i) \, +\E \left[ +\check X_i^\kappa +\bigm| \tilde \cH_{n+m} +\right] +\bigm| \cH_0 +\right] +\right) +\right| +\\ +&\quad= +\left| +\sum_{|\kappa| = 3} +\frac{1}{\kappa!} +\E \left[ +\partial^\kappa f(Y_i) \, +\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] +\bigm| \cH_0 +\right] +\right| +\leq +\frac{1}{\sigma^3} +\sum_{|\kappa| = 3} +\E \left[ +\left| +\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] +\right| +\bigm| \cH_0 +\right]. +\end{align*} +% +Combining these and summing over $i$ with $k=2$ shows +% +\begin{align*} +\E\left[ +f\big(\tilde S\big) - f\big(\check S\big) +\bigm| \cH_0 +\right] +&\leq +\frac{1}{\sigma^2 \eta \sqrt{2}} +\sum_{i=1}^{n+m} +\E \left[ +\|\tilde X_i\|_p \|\tilde X_i\|_2^2 ++ \|\check X_i\|_p \|\check X_i\|_2^2 +\bigm| \cH_0 +\right] +\end{align*} +% +On the other hand, taking $k = 3$ gives +% +\begin{align*} +\E\left[ +f\big(\tilde S\big) - f\big(\check S\big) +\bigm| \cH_0 +\right] +&\leq +\frac{1}{\sigma^3 \eta \sqrt{6}} +\sum_{i=1}^{n+m} +\E \left[ +\|\tilde X_i\|_p \|\tilde X_i\|_2^3 ++ \|\check X_i\|_p \|\check X_i\|_2^3 +\bigm| \cH_0 +\right] \\ +&\quad+ +\frac{1}{\sigma^3} +\sum_{i=1}^{n+m} +\sum_{|\kappa| = 3} +\E \left[ +\left| +\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] +\right| +\bigm| \cH_0 +\right]. +\end{align*} +% +For $1 \leq i \leq n$ we have +$\|\tilde X_i\| \leq \|X_i\|$ +and $\|\check X_i\| \leq \|V_i^{1/2} \tilde Z_i\|$. +For $n+1 \leq i \leq n+m$ we have +$\tilde X_i = H_\tau^{1/2} Z_i / \sqrt m$ +and $\check X_i = H_\tau^{1/2} \tilde Z_i / \sqrt m$ +which are equal in distribution given $\cH_0$. 
+So with +% +\begin{align*} +\tilde \beta_{p,k} +&= +\sum_{i=1}^{n} +\E \left[ +\|X_i\|_p \|X_i\|_2^k ++ \|V_i^{1/2} Z_i\|_p \|V_i^{1/2} Z_i\|_2^k +\bigm| \cH_0 +\right], +\end{align*} +% +we have, since $k \in \{2,3\}$, +% +\begin{align*} +&\sum_{i=1}^{n+m} +\E \left[ +\|\tilde X_i\|_p \|\tilde X_i\|_2^k ++ \|\check X_i\|_p \|\check X_i\|_2^k +\bigm| \cH_0 +\right] +\leq +\tilde\beta_{p,k} ++ \frac{2}{\sqrt m} +\E \left[ +\|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k +\bigm| \cH_0 +\right]. +\end{align*} +% +Since $H_i$ is weakly decreasing under the +semi-definite partial order, we have +$H_\tau \preceq H_0 = \Sigma + M$ +implying that $|(H_\tau)_{j j}| \leq \|\Sigma + M\|_{\max}$ and +$\E\big[|(H_\tau^{1/2} Z)_j|^3 \mid \cH_0 \big] +\leq \sqrt{8/\pi}\, \|\Sigma + M\|_{\max}^{3/2}$. +Hence as $p \geq 1$ and $k \in \{2,3\}$, +% +\begin{align*} +\E\left[ +\|H_\tau^{1/2}Z\|_p +\|H_\tau^{1/2}Z\|_2^k +\bigm| \cH_0 +\right] +&\leq +\E\left[\|H_\tau^{1/2} Z\|_1^{k+1} +\bigm| \cH_0 +\right] +\leq +d^{k+1} \max_{1\leq j\leq d} +\E\left[|(H_\tau^{1/2} Z)_j|^{k+1} +\bigm| \cH_0 +\right] \\ +&\leq 3 d^4 \, +\|\Sigma + M\|_{\max}^{(k+1)/2} +\leq 6 d^4 \, +\|\Sigma \|_{\max}^{(k+1)/2} ++ 6 d^4 \|M\|. +\end{align*} +% +Assuming some $X_i$ is not identically zero so +the result is non-trivial, +and supposing that $\Sigma$ is bounded a.s.\ +(replacing $\Sigma$ by $\Sigma \cdot \I\{\|\Sigma\|_{\max} \leq C\}$ +for an appropriately large $C$ if necessary), +take $m$ large enough that +% +\begin{align} +\label{eq:yurinskii_app_bound_extra_terms} +\frac{2}{\sqrt m} +\E \left[ +\|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k +\bigm| \cH_0 +\right] +\leq +\frac{1}{4} +\beta_{p,k}. +\end{align} +% +Further, if $|\kappa| = 3$ then +$\big|\E \big[ +\tilde X_i^\kappa \mid \tilde \cH_{i-1} \big]\big| +\leq \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right]\big|$ +for $1 \leq i \leq n$ +while by symmetry of the Gaussian distribution +$\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] = 0$ +for $n+1 \leq i \leq n+m$. +Hence with +% +\begin{align*} +\tilde \pi_3 +&= +\sum_{i=1}^{n+m} +\sum_{|\kappa| = 3} +\E \Big[ \big| +\E \left[ X_i^\kappa \mid \cH_{i-1} \right] +\big| \mid \cH_0 \Big], +\end{align*} +% +we have +% +\begin{align*} +\E\left[ +f\big(\tilde S\big) - f\big(\check S\big) +\bigm| \cH_0 +\right] +&\leq +\min \left\{ +\frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} ++ \frac{\beta_{p,2}}{4 \sigma^2 \eta}, +\frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} ++ \frac{\beta_{p,3}}{4 \sigma^3 \eta} ++ \frac{\tilde \pi_3}{\sigma^3} +\right\}. +\end{align*} +% +Along with Lemma~\ref{lem:yurinskii_app_smooth_approximation}, and with +$\sigma = \eta / t$ and $\varepsilon = \P(\|Z\|_p > t)$, +we conclude that +% +\begin{align*} +&\P(\tilde S \in A \mid \cH_0) += +\E\big[\I\{\tilde S \in A\} - f(\tilde S) +\mid \cH_0 +\big] ++ \E\big[f(\tilde S) - f\big(\check S\big) +\mid \cH_0 +\big] ++ \E \big[f\big(\check S\big) +\mid \cH_0 +\big] \\ +&\,\leq +\varepsilon\P(\tilde S \in A +\mid \cH_0) ++ \min \! 
\left\{ +\frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} ++ \frac{\beta_{p,2}}{4 \sigma^2 \eta}, +\frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} ++ \frac{\beta_{p,3}}{4 \sigma^3 \eta} ++ \frac{\tilde \pi_3}{\sigma^3} +\right\} ++ +\varepsilon ++ (1 - \varepsilon) \P\big(\check S \in A_p^{3\eta} +\mid \cH_0 +\big) \\ +&\,\leq +\P\big( \check S \in A_p^{3\eta} +\mid \cH_0 +\big) ++ 2 \P(\|Z\|_p > t) ++ \min\!\left\{ +\frac{3 \tilde \beta_{p,2} t^2}{4 \eta^3} ++ \frac{\beta_{p,2} t^2}{4 \eta^3}, +\frac{3 \tilde \beta_{p,3} t^3}{4 \eta^4} ++ \frac{\beta_{p,3} t^3}{4 \eta^4} ++ \frac{\tilde \pi_3 t^3}{\eta^3} +\right\}. +\end{align*} +% +Taking a supremum and an outer expectation yields +with $\beta_{p,k} = \E\big[\tilde \beta_{p,k}\big]$ +and $\pi_3 = \E[\tilde \pi_3]$, +% +\begin{align*} +&\E^* \left[ +\sup_{A \in \cB(\R^d)} +\left\{ +\P(\tilde S \in A \mid \cH_0) +- \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big) +\right\} +\right] \\ +&\quad\leq +2 \P(\|Z\|_p > t) ++ \min \left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\}. +\end{align*} +% +Finally, since +$\check S = \sum_{i=1}^n \tilde V_i^{1/2} \tilde Z_i +\sim \cN(0,\Sigma + M)$ conditional on $\cH_0$, +the conditional Strassen theorem +in Lemma~\ref{lem:yurinskii_app_strassen} +ensures the existence of $\tilde S$ and +$\tilde T \mid \cH_0 \sim \cN(0, \Sigma + M)$ +such that +% +\begin{align} +\label{eq:yurinskii_app_approx_modified_martingale} +\P\left(\|\tilde S-\tilde T\|_p>3\eta\right) +&\leq +\inf_{t>0} +\left\{ +2 \P(\|Z\|_p > t) ++ \min \left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\}, +\end{align} +% +since the infimum is attained by continuity of $\|Z\|_p$. + +\proofparagraph{conclusion} + +We show how to write +$\tilde T = (\Sigma + M)^{1/2} W$ +where $W \sim \cN(0,I_d)$ +and use this representation to construct +$T \mid \cH_0 \sim \cN(0, \Sigma)$. +By the spectral theorem, let $\Sigma + M = U \Lambda U^\T$ +where $U$ is a $d \times d$ orthogonal random matrix +and $\Lambda$ is a diagonal $d \times d$ random matrix with +diagonal entries satisfying +$\lambda_1 \geq \cdots \geq \lambda_r > 0$ +and $\lambda_{r+1} = \cdots = \lambda_d = 0$ +where $r = \rank (\Sigma + M)$. +Let $\Lambda^+$ be the Moore--Penrose pseudo-inverse of $\Lambda$ +(obtained by inverting its non-zero elements) and define +$W = U (\Lambda^+)^{1/2} U^\T \tilde T + U \tilde W$, where +the first $r$ elements of $\tilde W$ are zero +and the last $d-r$ elements are i.i.d.\ $\cN(0,1)$ +independent from $\tilde T$. +Then, it is easy to check that +$W \sim \cN(0, I_d)$ and that +$\tilde T = (\Sigma + M)^{1/2} W$. +Now define $T = \Sigma^{1/2} W$ so +% +\begin{equation}% +\label{eq:yurinskii_app_approx_target} +\P\big(\|T - \tilde T\|_p > \eta\big) += \P\big(\big\|\big((\Sigma + M)^{1/2} +- \Sigma^{1/2} \big) W \big\|_p>\eta \big) += \delta_p(M, \eta). 
+\end{equation} +% +Finally +\eqref{eq:yurinskii_app_approx_modified_original}, +\eqref{eq:yurinskii_app_approx_modified_martingale}, +\eqref{eq:yurinskii_app_approx_target}, +the triangle inequality, +and a union bound conclude the proof since +by taking an infimum over $M \succeq 0$, +and by possibly reducing the constant of $1/4$ in +\eqref{eq:yurinskii_app_bound_extra_terms} to account for +this infimum being potentially unattainable, +% +\begin{align*} +\P\big(\|S-T\|_p > 5\eta\big) +&\leq +\P\big(\|\tilde S - \tilde T \|_p > 3\eta \big) ++\P\big(\|S - \tilde S \|_p > \eta\big) ++\P\big(\|T - \tilde T \|_p > \eta\big) \\ +&\leq +\inf_{t>0} +\left\{ +2 \P\big( \|Z\|_p > t \big) ++ \min\left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\} \\ +&\quad+ +\inf_{M \succeq 0} +\big\{ 2\gamma(M) + \delta_p(M,\eta) ++ \varepsilon_p(M, \eta)\big\}. +\end{align*} +% +\end{proof} + +Lemma~\ref{lem:yurinskii_app_sa_martingale} and the martingale approximation +immediately yield Theorem~\ref{thm:yurinskii_sa_dependent}. + +\begin{proof}[Theorem~\ref{thm:yurinskii_sa_dependent}] +Apply Lemma~\ref{lem:yurinskii_app_sa_martingale} to +the martingale $\sum_{i=1}^{n} \tilde X_i$, +noting that $S - \sum_{i=1}^{n} \tilde X_i = U$. +\end{proof} + +Bounding the quantities +in Theorem~\ref{thm:yurinskii_sa_dependent} gives a +user-friendly version as Proposition~\ref{pro:yurinskii_sa_simplified}. + +\begin{proof}[Proposition~\ref{pro:yurinskii_sa_simplified}] + +Set $M = \nu^2 I_d$ and +bound the terms appearing +the main inequality in Proposition~\ref{pro:yurinskii_sa_simplified}. + +\proofparagraph{bounding $\P( \|Z\|_p > t )$} + +By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, +we have +$\P( \|Z\|_p > t ) \leq \E[\|Z\|_p] / t \leq \phi_p(d) / t$. + +\proofparagraph{bounding $\gamma(M)$} + +With $M = \nu^2 I_d$, +by Markov's inequality, +$\gamma(M) = \P\big(\Omega \npreceq M\big) += \P\big(\|\Omega\|_2 > \nu^2 \big) +\leq \nu^{-2} \E[\|\Omega\|_2]$. + +\proofparagraph{bounding $\delta(M, \eta)$} + +By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, +using +$\max_j |M_{j j}| \leq \|M\|_2$ +for $M \succeq 0$, +% +\begin{align*} +\delta_{p}(M,\eta) +&= \P\left( +\big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p +\geq \eta +\right) +\leq \frac{\phi_p(d)} {\eta} +\E \left[ +\big\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\big\|_2 +\right]. +\end{align*} +% +For semi-definite matrices +the eigenvalue operator commutes with smooth matrix functions so +% +\begin{align*} +\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\|_2 +&= +\max_{1 \leq j \leq d} +\left| +\sqrt{\lambda_j(\Sigma) + \nu^2} - \sqrt{\lambda_j(\Sigma)} +\right| +\leq \nu +\end{align*} +% +and hence $\delta_{p}(M,\eta) \leq \phi_p(d)\nu / \eta$. + +\proofparagraph{bounding $\varepsilon(M, \eta)$} + +Note that $(M -\Omega)^{1/2}Z$ is a centered Gaussian +conditional on $\cH_n$, +on the event $\{\Omega \preceq M\}$. 
+We thus have by Markov's inequality, +Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, +and Jensen's inequality that +% +\begin{align*} +\varepsilon_p(M, \eta) +&= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ +\Omega \preceq M\right) +\leq +\frac{1}{\eta} +\E\left[ +\I\{\Omega \preceq M\} +\E\left[ +\big\| (M - \Omega)^{1/2} Z \big\|_p +\mid \cH_n +\right] +\right] \\ +&\leq +\frac{\phi_p(d)}{\eta} +\E\left[ +\I\{\Omega \preceq M\} +\max_{1 \leq j \leq d} +\sqrt{(M - \Omega)_{j j}} +\right] +\leq +\frac{\phi_p(d)}{\eta} +\E\left[ +\sqrt{\|M - \Omega\|_2} +\right] \\ +&\leq +\frac{\phi_p(d)}{\eta} +\E\left[ +\sqrt{\|\Omega\|_2} + \nu +\right] +\leq +\frac{\phi_p(d)}{\eta} +\left(\sqrt{\E[\|\Omega\|_2]} + \nu \right). +\end{align*} +% +Thus by Theorem~\ref{thm:yurinskii_sa_dependent} and the previous parts, +% +\begin{align*} +\P\big(\|S-T\|_p > 6\eta\big) +&\leq +\inf_{t>0} +\left\{ +2 \P\big(\|Z\|_p>t\big) ++ \min\left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\} \\ +&\quad+ +\inf_{M \succeq 0} +\big\{ 2\gamma(M) + \delta_p(M,\eta) ++ \varepsilon_p(M, \eta)\big\} ++\P\big(\|U\|_p>\eta\big) \\ +&\leq +\inf_{t>0} +\left\{ +\frac{2 \phi_p(d)}{t} ++ \min\left\{ +\frac{\beta_{p,2} t^2}{\eta^3}, +\frac{\beta_{p,3} t^3}{\eta^4} ++ \frac{\pi_3 t^3}{\eta^3} +\right\} +\right\} \\ +&\quad+ +\inf_{\nu > 0} +\left\{ \frac{2\E \left[ \|\Omega\|_2 \right]}{\nu^2} ++ \frac{2 \phi_p(d) \nu}{\eta} +\right\} ++ \frac{\phi_p(d) \sqrt{\E \left[ \|\Omega\|_2 \right]}}{\eta} ++\P\big(\|U\|_p>\eta\big). +\end{align*} +% +Set $t = 2^{1/3} \phi_p(d)^{1/3} \beta_{p,2}^{-1/3} \eta$ +and $\nu = \E[\|\Omega\|_2]^{1/3} \phi_p(d)^{-1/3} \eta^{1/3}$, +then replace $\eta$ with $\eta / 6$ to see +% +\begin{align*} +\P\big(\|S-T\|_p > 6\eta\big) +&\leq +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++\P\left(\|U\|_p>\frac{\eta}{6}\right). +\end{align*} +% +Whenever $\pi_3 = 0$ we can set +$t = 2^{1/4} \phi_p(d)^{1/4} \beta_{p,3}^{-1/4} \eta$, +and with $\nu$ as above we obtain +% +\begin{align*} +\P\big(\|S-T\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} +\right)^{1/4} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3} ++\P\left(\|U\|_p>\frac{\eta}{6}\right). +\end{align*} +% +\end{proof} + +After establishing Proposition~\ref{pro:yurinskii_sa_simplified}, +Corollaries~\ref{cor:yurinskii_sa_mixingale}, +\ref{cor:yurinskii_sa_martingale}, +and \ref{cor:yurinskii_sa_indep} follow easily. + +\begin{proof}[Corollary~\ref{cor:yurinskii_sa_mixingale}] +Proposition~\ref{pro:yurinskii_sa_simplified} with +$\P ( \|U\|_p > \frac{\eta}{6} ) +\leq \frac{6}{\eta} \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. +\end{proof} + +\begin{proof}[Corollary~\ref{cor:yurinskii_sa_martingale}] +By Proposition~\ref{pro:yurinskii_sa_simplified} +with $U=0$ a.s. +\end{proof} + +\begin{proof}[Corollary~\ref{cor:yurinskii_sa_indep}] +By Corollary~\ref{cor:yurinskii_sa_martingale} +with $\Omega=0$ a.s. +\end{proof} + +We conclude this section with a discussion expanding on the comments made +in Remark~\ref{rem:yurinskii_coupling_bounds_probability} on deriving bounds in +probability from Yurinskii's coupling. 
Consider for illustration the +independent data second-order result given in +Corollary~\ref{cor:yurinskii_sa_indep}: for each $\eta > 0$, +there exists $T_n \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying +% +\begin{align*} +\P\big(\|S_n-T_n\|_p > \eta\big) +&\leq +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3}, +\end{align*} +% +where here we make explicit the dependence on the sample size $n$ for clarity. +The naive approach to converting this into a probability bound for +$\|S_n-T_n\|_p$ is to select $\eta$ to ensure the right-hand side is +of order $1$, arguing that the probability can then be made arbitrarily +small by taking, in this case, $\eta$ to be a large enough multiple of +$\beta_{p,2}^{1/3} \phi_p(d)^{2/3}$. However, the somewhat subtle mistake is +in neglecting the fact that the realization of the coupling variable $T_n$ +will in general depend on $\eta$, rendering the resulting +bound invalid. +As an explicit example of this phenomenon, take $\eta > 1$ and suppose +$\|S_n - T_n(\eta)\| = \eta$ with probability $1 - 1/\eta$ and +$\|S_n - T_n(\eta)\| = n$ with probability $1/\eta$. +Then $\P\big(\|S_n - T_n(\eta)\| > \eta\big) = 1/\eta$ +but it is not true for any $\eta$ that $\|S_n - T_n(\eta)\| \lesssim_\P 1$. + +We propose in Remark~\ref{rem:yurinskii_coupling_bounds_probability} the +following fix. +Instead of selecting $\eta$ to ensure the right-hand side is of order $1$, +we instead choose it so the bound converges (slowly) to zero. This is +easily achieved by taking the naive and incorrect bound and multiplying +by some divergent sequence $R_n$. The resulting inequality reads, +in the case of Corollary~\ref{cor:yurinskii_sa_indep} with +$\eta = \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n$, +% +\begin{align*} +\P\Big(\|S_n-T_n\|_p > +\beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n +\Big) +&\leq +\frac{24}{R_n} +\to 0. +\end{align*} +% +We thus recover, for the price of a rate which is slower by an arbitrarily +small amount, a valid upper bound in probability, as we can immediately +conclude that +% +\begin{align*} +\|S_n-T_n\|_p +\lesssim_\P +\beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n. +\end{align*} + +\subsection{Strong approximation for martingale empirical processes} + +We begin by presenting some calculations omitted from the main text +relating to the motivating example of kernel density estimation with +i.i.d.\ data. +First, the bias is bounded as +% +\begin{align*} +\big| \E \big[ \hat g(x) \big] - g(x) \big| +&= +\left| +\int_{\frac{-x}{h}}^{\frac{1-x}{h}} +K(\xi) +\diff \xi +- 1 +\right| +\leq +2 \int_{\frac{a}{h}}^\infty +\frac{1}{\sqrt{2 \pi}} +e^{-\frac{\xi^2}{2}} +\diff \xi +\leq +\frac{h}{a} +\sqrt{\frac{2}{\pi}} +e^{-\frac{a^2}{2 h^2}}. +\end{align*} +% +Next, we do the calculations necessary to apply +Corollary~\ref{cor:yurinskii_sa_indep}. +Define $k_{i j} = \frac{1}{n h} K \left( \frac{X_i - x_j}{h} \right)$ and +$k_i = (k_{i j} : 1 \leq j \leq N)$. +Then $\|k_i\|_\infty \leq \frac{1}{n h \sqrt{2 \pi}}$ a.s.\ and +$\E[\|k_i\|_2^2] \leq \frac{N}{n^2 h} \int_{-\infty}^\infty K(\xi)^2 \diff \xi +\leq \frac{N}{2 n^2 h \sqrt{\pi}}$. +Let $V = \Var[k_i] \in \R^{N \times N}$, +so assuming that $1/h \geq \log 2 N$, +by Lemma~\ref{lem:yurinskii_app_gaussian_useful} we bound +% +\begin{align*} +\beta_{\infty,2} +&= +n \E\left[\| k_i \|^2_2 \| k_i \|_\infty +\right] ++ n \E \left[ \|V^{1/2} Z \|^2_2 \|V^{1/2} Z \|_\infty \right] +\leq +\frac{N}{\sqrt{8} n^2 h^2 \pi} ++ \frac{4 N \sqrt{\log 2 N}}{\sqrt{8} n^2 h^{3/2} \pi^{3/4}} +\leq +\frac{N}{n^2 h^2}. 
+\end{align*} +% +Finally, we verify the stochastic continuity bounds. +By the Lipschitz property of $K$, it is easy to show that +for $x,x' \in \cX$ we have +$\left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) +- \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right| +\lesssim \frac{|x-x'|}{h^2}$ almost surely, and also that +$\E \Big[ \left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) +- \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right|^2 \Big] +\lesssim \frac{|x-x'|^2}{h^3}$. +By chaining with the Bernstein--Orlicz norm and polynomial covering numbers, +% +\begin{align*} +\sup_{|x-x'| \leq \delta} +\big\|S(x) - S(x')\big\|_\infty +\lesssim_\P +\delta +\sqrt{\frac{\log n}{n h^3}} +\end{align*} +% +whenever $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$. +By a Gaussian process maximal inequality +\citep[Corollary~2.2.8]{van1996weak} +the same bound holds for $T(x)$ with +% +\begin{align*} +\sup_{|x-x'| \leq \delta} +\big\|T(x) - T(x')\big\|_\infty +\lesssim_\P +\delta +\sqrt{\frac{\log n}{n h^3}}. +\end{align*} + +\begin{proof}[Lemma~\ref{lem:yurinskii_kde_eigenvalue}] + +For $x, x' \in [a, 1-a]$, the scaled covariance function +of this nonparametric estimator is +% +\begin{align*} +n h\, \Cov\big[\hat g(x), \hat g(x')\big] +&= +\frac{1}{h} +\E \left[ +K \left( \frac{X_i - x}{h} \right) +K \left( \frac{X_i - x'}{h} \right) +\right] \\ +&\quad- +\frac{1}{h} +\E \left[ +K \left( \frac{X_i - x}{h} \right) +\right] +\E \left[ +K \left( \frac{X_i - x'}{h} \right) +\right] \\ +&= +\frac{1}{2 \pi} +\int_{\frac{-x}{h}}^{\frac{1-x}{h}} +\exp \left( - \frac{t^2}{2} \right) +\exp \left( - \frac{1}{2} \left( t + \frac{x - x'}{h} \right)^2 \right) +\diff t +- h I(x) I(x') +\end{align*} +% +where +$I(x) = \frac{1}{\sqrt 2 \pi} \int_{-x/h}^{(1-x)/h} e^{-t^2/2} \diff t$. +Completing the square and a substitution gives +% +\begin{align*} +n h\, \Cov\big[\hat g(x), \hat g(x')\big] +&= +\frac{1}{2 \pi} +\exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) +\int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} +\exp \left(-t^2\right) +\diff t +- h I(x) I(x'). +\end{align*} +% +Now we show that since $x, x'$ are not too close to the boundary +of $[0,1]$, +the limits in the above integral can be replaced by $\pm \infty$. +Note that $\frac{-x-x'}{2h} \leq \frac{-a}{h}$ +and $\frac{2-x-x'}{2h} \geq \frac{a}{h}$ so +% +\begin{align*} +\int_{-\infty}^{\infty} +\exp \left(-t^2\right) +\diff t +- \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} +\exp \left(-t^2\right) +\diff t +\leq +2 \int_{a/h}^\infty +\exp \left(-t^2\right) +\diff t +\leq +\frac{h}{a} +\exp \left(- \frac{a^2}{h^2}\right). +\end{align*} +% +Therefore, since +$\int_{-\infty}^{\infty} e^{-t^2} \diff t = \sqrt \pi$, +% +\begin{align*} +\left| +n h\, \Cov\big[\hat g(x), \hat g(x')\big] +- \frac{1}{2 \sqrt \pi} +\exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) ++ h I(x) I(x') +\right| +\leq +\frac{h}{2 \pi a} +\exp \left(- \frac{a^2}{h^2}\right). +\end{align*} +% +Define the $N \times N$ matrix +$\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} +\exp \left( - \frac{1}{4} \left( \frac{x_i-x_j}{h} \right)^2 \right)$. 
+By \citet[Proposition~2.4, +Proposition~2.5, and Equation~2.10]{baxter1994norm}, +with +$\cB_k = \big\{b \in \R^\Z : +\sum_{i \in \Z} \I\{b_i \neq 0\} \leq k \big\}$, +% +\begin{align*} +\inf_{k \in \N} +\inf_{b \in \R^k} +\frac{\sum_{i=1}^k \sum_{j=1}^k b_i b_j \, e^{-\lambda(i-j)^2}} +{\sum_{i=1}^k b_i^2} += +\sqrt{\frac{\pi}{\lambda}} +\sum_{i=-\infty}^{\infty} +\exp \left( - \frac{(\pi e + 2 \pi i)^2}{4 \lambda} \right). +\end{align*} +% +We use Riemann sums, +noting that $\pi e + 2 \pi x = 0$ at +$x = -e/2 \approx -1.359$. +Consider the substitutions +$\Z \cap (-\infty, -3] \mapsto (-\infty, -2]$, +$\{-2, -1\} \mapsto \{-2, -1\}$, and +$\Z \cap [0, \infty) \mapsto [-1, \infty)$. +% +\begin{align*} +\sum_{i \in \Z} +e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} +&\leq +\int_{-\infty}^{-2} +e^{ - (\pi e + 2 \pi x)^2/4 \lambda} +\diff x ++ e^{- (\pi e - 4 \pi)^2/4 \lambda} \\ +&\quad+ +e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} ++ \int_{-1}^{\infty} +e^{ -(\pi e + 2 \pi x)^2 / 4 \lambda} +\diff x. +\end{align*} +% +Now use the substitution $t = \frac{\pi e + 2 \pi x}{2 \sqrt \lambda}$ +and suppose $\lambda < 1$, yielding +% +\begin{align*} +\sum_{i \in \Z} +e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} +&\leq +\frac{\sqrt \lambda}{\pi} +\int_{-\infty}^{\frac{\pi e - 4 \pi}{2 \sqrt \lambda}} +e^{-t^2} +\diff t ++ e^{- (\pi e - 4 \pi)^2/4 \lambda} ++ e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} ++ \frac{\sqrt \lambda}{\pi} +\int_{\frac{\pi e - 2 \pi}{2 \sqrt \lambda}}^{\infty} +e^{-t^2} +\diff t \\ +&\leq +\left( 1 + \frac{1}{\pi} \frac{\lambda}{4 \pi - \pi e} \right) +e^{-(\pi e - 4 \pi)^2 / 4 \lambda} ++ +\left( 1 + \frac{1}{\pi} \frac{\lambda}{\pi e - 2 \pi} \right) +e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \\ +&\leq +\frac{13}{12} +e^{-(\pi e - 4 \pi)^2 / 4 \lambda} ++ +\frac{8}{7} +e^{- (\pi e - 2 \pi)^2 / 4 \lambda} +\leq +\frac{9}{4} +\exp \left( - \frac{5}{4 \lambda} \right). +\end{align*} +% +Therefore +% +\begin{align*} +\inf_{k \in \N} +\inf_{b \in \cB_k} +\frac{\sum_{i \in \Z} \sum_{j \in \Z} b_i b_j \, e^{-\lambda(i-j)^2}} +{\sum_{i \in \Z} b_i^2} +< \frac{4}{\sqrt \lambda} +\exp \left( - \frac{5}{4 \lambda} \right) +< 4 e^{-1/\lambda}. +\end{align*} +% +From this and since +$\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} e^{-\lambda(i-j)^2}$ +with $\lambda = \frac{1}{4(N-1)^2 h^2} \leq \frac{\delta^2}{h^2}$, +for each $h$ and some $\delta \leq h$, +we have $\lambda_{\min}(\tilde\Sigma) \leq 2 e^{-h^2/\delta^2}$. +Recall that +% +\begin{align*} +\left| +\Sigma_{i j} +- \tilde\Sigma_{i j} ++ h I(x_i) I(x_j) +\right| +\leq +\frac{h}{2 \pi a} +\exp \left(- \frac{a^2}{h^2}\right). +\end{align*} +% +For any positive semi-definite $N \times N$ matrices $A$ and $B$ +and vector $v$ we have $\lambda_{\min}(A - v v^\T) \leq \lambda_{\min}(A)$ +and $\lambda_{\min}(B) \leq \lambda_{\min}(A) + \|B-A\|_2 +\leq \lambda_{\min}(A) + N \|B-A\|_{\max}$. +Hence with $I_i = I(x_i)$, +% +\begin{align*} +\lambda_{\min}(\Sigma) +&\leq +\lambda_{\min}(\tilde\Sigma - h I I^\T) ++ \frac{N h}{2 \pi a} +\exp \left(- \frac{a^2}{h^2}\right) +\leq +2 e^{-h^2/\delta^2} ++ \frac{h}{\pi a \delta} +e^{-a^2 / h^2}. +\end{align*} +\end{proof} + +\begin{proof}[Proposition~\ref{pro:yurinskii_emp_proc}] + +Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$. 
+Using a union bound, we can write +% +\begin{align*} +&\P\left(\sup_{f \in \cF} +\big| S(f) - T(f) \big| +\geq 2t + \eta \right) +\leq +\P\left(\sup_{f \in \cF_\delta} +\big| S(f) - T(f) \big| +\geq \eta \right) \\ +&\qquad\qquad+ +\P\left(\sup_{d(f,f') \leq \delta} +\big| S(f) - S(f') \big| +\geq t \right) ++ \P\left(\sup_{d(f,f') \leq \delta} +\big| T(f) - T(f') \big| +\geq t \right). +\end{align*} + +\proofparagraph{bounding the difference on $\cF_\delta$} + +We apply Corollary~\ref{cor:yurinskii_sa_martingale} +with $p = \infty$ to the +martingale difference sequence +$\cF_\delta(X_i) = \big(f(X_i) : f \in \cF_\delta\big)$ +which takes values in $\R^{|\cF_\delta|}$. +Square integrability can be assumed otherwise +$\beta_\delta = \infty$. +Note $\sum_{i=1}^n \cF_\delta(X_i) = S(\cF_\delta)$ +and $\phi_\infty(\cF_\delta) \leq \sqrt{2 \log 2 |\cF_\delta|}$. +Therefore there exists a conditionally Gaussian vector $T(\cF_\delta)$ +with the same covariance structure as $S(\cF_\delta)$ +conditional on $\cH_0$ satisfying +% +\begin{align*} +\P\left( +\sup_{f \in \cF_\delta} +\big| S(f) - T(f) \big| +\geq \eta +\right) +&\leq +\frac{24\beta_\delta^{\frac{1}{3}} +(2\log 2 |\cF_\delta|)^{\frac{1}{3}}}{\eta} ++ 17\left(\frac{\sqrt{2 \log 2 |\cF_\delta|} +\sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{\frac{2}{3}}. +\end{align*} + +\proofparagraph{bounding the fluctuations in $S(f)$} + +Since $\big\| S(f) - S(f') \big\|_\psi \leq L d(f,f')$, +by Theorem~2.2.4 in \citet{van1996weak} +% +\begin{align*} +\left\| +\sup_{d(f,f') \leq \delta} +\big| S(f) - S(f') \big| +\right\|_\psi +&\leq +C_\psi L +\left( +\int_0^\delta +\psi^{-1}(N_\varepsilon) \diff{\varepsilon} ++ \delta \psi^{-1}(N_\delta^2) +\right) += C_\psi L J_\psi(\delta). +\end{align*} +% +Then, by Markov's inequality and the definition of the Orlicz norm, +% +\begin{align*} +\P\left( +\sup_{d(f,f') \leq \delta} +\big| S(f) - S(f') \big| +\geq t +\right) +&\leq +\psi\left(\frac{t}{C_\psi L J_\psi(\delta)} \right)^{-1}. +\end{align*} + +\proofparagraph{bounding the fluctuations in $T(f)$} + +By the Vorob'ev--Berkes--Philipp theorem +\citep{dudley1999uniform}, +$T(\cF_\delta)$ extends to a conditionally Gaussian process $T(f)$. +Firstly, since +$\bigvvvert T(f) - T(f') \bigvvvert_2 \leq L d(f,f')$ +conditionally on $\cH_0$, +and $T(f)$ is a conditional Gaussian process, we have +$\big\| T(f) - T(f') \big\|_{\psi_2} \leq 2 L d(f,f')$ +conditional on $\cH_0$ +by \citet[Chapter~2.2, Complement~1]{van1996weak}, +where $\psi_2(x) = \exp(x^2) - 1$. +Thus again by Theorem~2.2.4 in \citet{van1996weak}, +again conditioning on $\cH_0$, +% +\begin{align*} +\left\| +\sup_{d(f,f') \leq \delta} +\big| T(f) - T(f') \big| +\right\|_{\psi_2} +&\leq +C_1 L +\int_0^\delta +\sqrt{\log N_\varepsilon} \diff{\varepsilon} += C_1 L J_2(\delta) +\end{align*} +% +for some universal constant $C_1 > 0$, +where we used $\psi_2^{-1}(x) = \sqrt{\log(1+x)}$ +and monotonicity of covering numbers. +Then by Markov's inequality and the definition of the Orlicz norm, +% +\begin{align*} +\P\left( +\sup_{d(f,f') \leq \delta} +\big| T(f) - T(f') \big| +\geq t +\right) +&\leq +\left( +\exp\left( +\frac{t^2}{C_1^2 L^2 J_2(\delta)^2} +\right) - 1 +\right)^{-1} +\!\vee 1 +\leq +2 \exp\left( +\frac{-t^2}{C_1^2 L^2 J_2(\delta)^2} +\right). +\end{align*} +% + +\proofparagraph{conclusion} + +The result follows by scaling $t$ and $\eta$ +and enlarging constants if necessary. 
+% +\end{proof} + +\subsection{Applications to nonparametric regression} + +\begin{proof}[Proposition~\ref{pro:yurinskii_series}] + +Proceed according to the decomposition in +Section~\ref{sec:yurinskii_series}. +By stationarity and Lemma~SA-2.1 in +\citet{cattaneo2020large}, +we have $\sup_w \|p(w)\|_1 \lesssim 1$ +and also $\|H\|_1 \lesssim n/k$ +and $\|H^{-1}\|_1 \lesssim k/n$. + +\proofparagraph{bounding $\beta_{\infty,2}$ and $\beta_{\infty,3}$} + +Set $X_i = p(W_i) \varepsilon_i$ +so $S = \sum_{i=1}^n X_i$, +and set $\sigma^2_i = \sigma^2(W_i)$ and +$V_i = \Var[X_i \mid \cH_{i-1}] = \sigma_i^2 p(W_i) p(W_i)^\T$. +Recall from Corollary~\ref{cor:yurinskii_sa_martingale} that for +$r \in \{2,3\}$, +% +\begin{align*} +\beta_{\infty,r} += \sum_{i=1}^n \E\left[\| X_i \|^r_2 \| X_i \|_\infty ++ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] +\end{align*} +% +with $Z_i \sim \cN(0,1)$ i.i.d.\ and independent of $V_i$. +For the first term, we use +$\sup_w \|p(w)\|_2 \lesssim 1$ +and bounded third moments of $\varepsilon_i$: +% +\begin{align*} +\E\left[ \| X_i \|^r_2 \| X_i \|_\infty \right] +&\leq +\E\left[ |\varepsilon_i|^3 \| p(W_i) \|^{r+1}_2 \right] +\lesssim 1. +\end{align*} +% +For the second term, apply Lemma~\ref{lem:yurinskii_app_gaussian_useful} +conditionally on +$\cH_n$ with $\sup_w \|p(w)\|_2 \lesssim 1$ to see +% +\begin{align*} +&\E\left[ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] +\lesssim +\sqrt{\log 2k} \ +\E\left[ +\max_{1 \leq j \leq k} +(V_i)_{j j}^{1/2} +\bigg( \sum_{j=1}^k (V_i)_{j j} \bigg)^{r/2} +\right] \\ +&\quad\lesssim +\sqrt{\log 2k} \ +\E\left[ +\sigma_i^{r+1} +\max_{1 \leq j \leq k} +p(W_i)_j +\bigg( +\sum_{j=1}^k +p(W_i)_{j}^2 +\bigg)^{r/2} +\right] +\lesssim +\sqrt{\log 2k} \ +\E\left[ +\sigma_i^{r+1} +\right] +\lesssim +\sqrt{\log 2k}. +\end{align*} +% +Putting these together yields +% +$\beta_{\infty,2} \lesssim n \sqrt{\log 2k}$ +and $\beta_{\infty,3} \lesssim n \sqrt{\log 2k}$. + +\proofparagraph{bounding $\Omega$} + +Set $\Omega = \sum_{i=1}^n \big(V_i - \E[V_i] \big)$ so +% +\begin{align*} +\Omega +&= \sum_{i=1}^n +\big(\sigma_i^2 p(W_i)p(W_i)^\T - \E\left[ \sigma_i^2 p(W_i)p(W_i)^\T +\right]\big). +\end{align*} +% +Observe that $\Omega_{j l}$ is the sum of a zero-mean +strictly stationary $\alpha$-mixing sequence and so $\E[\Omega_{j l}^2] +\lesssim n$ by +Lemma~\ref{lem:yurinskii_app_variance_mixing}% +\ref{it:yurinskii_app_variance_mixing_bounded}. +Since the basis functions +satisfy Assumption~3 in \citet{cattaneo2020large}, $\Omega$ has a bounded +number of non-zero entries in each row, so by Jensen's inequality +% +\begin{align*} +\E\left[ +\|\Omega\|_2 +\right] +&\leq +\E\left[ +\|\Omega\|_\rF +\right] +\leq +\left( +\sum_{j=1}^k +\sum_{l=1}^k +\E\left[ +\Omega_{j l}^2 +\right] +\right)^{1/2} +\lesssim \sqrt{n k}. +\end{align*} +% + +\proofparagraph{strong approximation} + +By Corollary~\ref{cor:yurinskii_sa_martingale} and the previous parts, +with any sequence $R_n \to \infty$, +% +\begin{align*} +\|S - T \|_\infty +&\lesssim_\P +\beta_{\infty,2}^{1/3} (\log 2k)^{1/3} R_n ++ \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n \\ +&\lesssim_\P +n^{1/3} \sqrt{\log 2k} R_n ++ (n k)^{1/4} \sqrt{\log 2k} R_n. 
+\end{align*} +% +If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then +the third-order version of Corollary~\ref{cor:yurinskii_sa_martingale} +applies since +% +\begin{align*} +\pi_3 +&= +\sum_{i=1}^{n} +\sum_{|\kappa| = 3} +\E \Big[ \big| +\E [ X_i^\kappa \mid \cH_{i-1} ] +\big| \Big] += +\sum_{i=1}^{n} +\sum_{|\kappa| = 3} +\E \Big[ \big| +p(W_i)^\kappa \, +\E [ \varepsilon_i^3 \mid \cH_{i-1} ] +\big| \Big] += 0, +\end{align*} +% +giving +% +\begin{align*} +\|S - T \|_\infty +&\lesssim_\P +\beta_{\infty,3}^{1/4} (\log 2k)^{3/8} R_n ++ \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n +\lesssim_\P +(n k)^{1/4} \sqrt{\log 2k} R_n. +\end{align*} +% +By H{\"o}lder's inequality and with +$\|H^{-1}\|_1 \lesssim k/n$ we have +% +\begin{align*} +\sup_{w \in \cW} +\left| +p(w)^\T H^{-1} S +- p(w)^\T H^{-1} T +\right| +&\leq +\sup_{w \in \cW} +\|p(w)\|_1 +\|H^{-1}\|_1 +\| S - T \|_\infty +\lesssim +n^{-1} k +\| S - T \|_\infty. +\end{align*} + +\proofparagraph{convergence of $\hat H$} + +We have +$\hat H - H = \sum_{i=1}^n \big(p(W_i)p(W_i)^\T - \E\left[ +p(W_i)p(W_i)^\T \right]\big)$. +Observe that $(\hat H - H)_{j l}$ is the sum of +a zero-mean strictly stationary $\alpha$-mixing sequence and so +$\E[(\hat H - H)_{j l}^2] \lesssim n$ by +Lemma~\ref{lem:yurinskii_app_variance_mixing}% +\ref{it:yurinskii_app_variance_mixing_bounded}. +Since the basis +functions satisfy Assumption~3 in \citet{cattaneo2020large}, +$\hat H-H$ has a +bounded number of non-zero entries in each row and so by Jensen's inequality +% +\begin{align*} +\E\left[ +\|\hat H-H\|_1 +\right] +&= +\E\left[ +\max_{1 \leq i \leq k} +\sum_{j=1}^k +\big|(\hat H-H)_{i j}\big| +\right] +\leq +\E\left[ +\sum_{1 \leq i \leq k} +\Bigg( +\sum_{j=1}^k +|(\hat H-H)_{i j}| +\Bigg)^2 +\right]^{\frac{1}{2}} +\lesssim \sqrt{n k}. +\end{align*} + +\proofparagraph{bounding the matrix term} + +Note $\|\hat H^{-1}\|_1 \leq \|H^{-1}\|_1 ++ \|\hat H^{-1}\|_1 \|\hat H-H\|_1 \|H^{-1}\|_1$ +so by the previous part, we deduce +% +\begin{align*} +\|\hat H^{-1}\|_1 +\leq +\frac{\|H^{-1}\|_1} +{1 - \|\hat H-H\|_1 \|H^{-1}\|_1} +\lesssim_\P +\frac{k/n} +{1 - \sqrt{n k}\, k/n} +\lesssim_\P +\frac{k}{n} +\end{align*} +% +as $k^3 / n \to 0$. Note that by the martingale structure, since +$p(W_i)$ is bounded and supported on a region with volume at most of the order +$1/k$, and as $W_i$ has a Lebesgue density, +% +\begin{align*} +\Var[T_j] +&= +\Var[S_j] += +\Var\left[ +\sum_{i=1}^n \varepsilon_i p(W_i)_j +\right] += +\sum_{i=1}^n +\E\left[ +\sigma_i^2 p(W_i)_j^2 +\right] +\lesssim +\frac{n}{k}. +\end{align*} +% +So by the Gaussian maximal inequality in +Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, +$\|T\|_\infty \lesssim_\P \sqrt{\frac{n \log 2k}{k}}$. +Since $k^3/n \to 0$, +% +\begin{align*} +\sup_{w \in \cW} +\left| +p(w)^\T (\hat H^{-1} - H^{-1}) S +\right| +&\leq +\sup_{w \in \cW} +\|p(w)^\T\|_1 +\|\hat H^{-1}\|_1 +\|\hat H - H\|_1 +\|H^{-1}\|_1 +\|S - T\|_\infty \\ +&\quad+ +\sup_{w \in \cW} +\|p(w)^\T\|_1 +\|\hat H^{-1}\|_1 +\|\hat H - H\|_1 +\|H^{-1}\|_1 +\|T\|_\infty \\ +&\lesssim_\P +\frac{k^2}{n^2} +\sqrt{n k} +\!\left( +n^{1/3} \sqrt{\log 2k} ++ (n k)^{1/4} \sqrt{\log 2k} +\right) +\!+ \frac{k^2}{n^2} +\sqrt{n k} +\sqrt{\frac{n \log 2k}{k}} \\ +&\lesssim_\P +\frac{k^2}{n} +\sqrt{\log 2k}. 
+\end{align*} +% + +\proofparagraph{conclusion of the main result} + +By the previous parts, +with $G(w) = p(w)^\T H^{-1} T$, +% +\begin{align*} +&\sup_{w \in \cW} +\left| +\hat\mu(w) - \mu(w) +- p(w)^\T H^{-1} T +\right| \\ +&\quad= +\sup_{w \in \cW} +\left| +p(w)^\T H^{-1} (S - T) ++ p(w)^\T (\hat H^{-1} - H^{-1}) S ++ \Bias(w) +\right| \\ +&\quad\lesssim_\P +\frac{k}{n} +\|S - T\|_\infty ++ \frac{k^2}{n} \sqrt{\log 2k} ++ \sup_{w \in \cW} |\Bias(w)| \\ +&\quad\lesssim_\P +\frac{k}{n} +\left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) R_n ++ \frac{k^2}{n} \sqrt{\log 2k} ++ \sup_{w \in \cW} |\Bias(w)| \\ +&\quad\lesssim_\P +n^{-2/3} k \sqrt{\log 2k} R_n ++ n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n ++ \frac{k^2}{n} \sqrt{\log 2k} ++ \sup_{w \in \cW} |\Bias(w)| \\ +&\quad\lesssim_\P +n^{-2/3} k \sqrt{\log 2k} R_n ++ \sup_{w \in \cW} |\Bias(w)| +\end{align*} +% +since $k^3/n \to 0$. +If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then +% +\begin{align*} +\sup_{w \in \cW} +\left| +\hat\mu(w) - \mu(w) +- p(w)^\T H^{-1} T +\right| +&\lesssim_\P +\frac{k}{n} +\|S - T\|_\infty ++ \frac{k^2}{n} \sqrt{\log 2k} ++ \sup_{w \in \cW} |\Bias(w)| \\ +&\lesssim_\P +n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n ++ \sup_{w \in \cW} |\Bias(w)|. +\end{align*} +% +Finally, we verify the variance bounds for the Gaussian process. +With $\sigma^2(w)$ bounded above, +% +\begin{align*} +\Var[G(w)] +&= +p(w)^\T H^{-1} +\Var\left[ \sum_{i=1}^n p(W_i) \varepsilon_i \right] +H^{-1} p(w) \\ +&= +p(w)^\T H^{-1} +\E\left[\sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) \right] +H^{-1} p(w) \\ +&\lesssim +\|p(w)\|_2^2 \|H^{-1}\|_2^2 +\|H\|_2 +\lesssim +k/n. +\end{align*} +% +Similarly, since $\sigma^2(w)$ is bounded away from zero, +% +\begin{align*} +\Var[G(w)] +&\gtrsim +\|p(w)\|_2^2 \|H^{-1}\|_2^2 +\|H^{-1}\|_2^{-1} +\gtrsim +k/n. +\end{align*} + +\proofparagraph{bounding the bias} + +We delegate the task of carefully deriving bounds on the bias to +\citet{cattaneo2020large}, who provide a high-level assumption on the +approximation error in Assumption~4 and then use it to derive bias bounds in +Section~3 of the form $\sup_{w \in \cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$. +This assumption is then verified for B-splines, wavelets, and piecewise +polynomials in their supplemental appendix. + +\end{proof} + +\begin{proof}[Proposition~\ref{pro:yurinskii_series_feasible}] +\proofparagraph{infeasible supremum approximation} + +Provided that the bias is negligible, +for all $s > 0$ we have +% +\begin{align*} +&\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} +\right| \leq t +\right) +- +\P\left( +\sup_{w \in \cW} +\left| +\frac{G(w)}{\sqrt{\rho(w,w)}} +\right| \leq t +\right) +\right| \\ +&\quad\leq +\sup_{t \in \R} +\P\left( +t \leq +\sup_{w \in \cW} +\left| +\frac{G(w)}{\sqrt{\rho(w,w)}} +\right| +\leq t + s +\right) ++ +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat\mu(w)-\mu(w)-G(w)}{\sqrt{\rho(w,w)}} +\right| > s +\right). +\end{align*} +% +By the Gaussian anti-concentration result given as Corollary~2.1 in +\citet{chernozhukov2014anti} applied to a discretization of $\cW$, the first +term is at most $s \sqrt{\log n}$ up to a constant factor, and the second +term converges to zero whenever +$\frac{1}{s} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} \to 0$. +Thus a suitable value of $s$ exists whenever $\frac{k^3(\log n)^6}{n} \to 0$. 
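+For illustration, one admissible choice
+(any choice balancing the two requirements works equally well) is
+$s = \big( \frac{k^3 (\log k)^3}{n} \big)^{1/12} (\log n)^{-1/4}$,
+for which both $s \sqrt{\log n}$ and
+$\frac{1}{s} \big( \frac{k^3 (\log k)^3}{n} \big)^{1/6}$
+are of order
+$\big( \frac{k^3 (\log k)^3 (\log n)^3}{n} \big)^{1/12}$,
+vanishing under this condition since eventually $\log k \leq \log n$.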
+ +\proofparagraph{feasible supremum approximation} + +By \citet[Lemma~3.1]{chernozhukov2013gaussian} and discretization, +with $\rho(w,w') = \E[\hat\rho(w,w')]$, +% +\begin{align*} +&\sup_{t \in \R} +\left| +\P\left( +\sup_{w \in \cW} +\left| +\frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} +\right| +\leq t \biggm| \bW, \bY +\right) +- \P\left( +\left| +\frac{G(w)}{\sqrt{\rho(w,w)}} +\right| +\leq t +\right) +\right| \\ +&\quad\lesssim_\P +\sup_{w,w' \in \cW} +\left| +\frac{\hat\rho(w,w')} +{\sqrt{\hat\rho(w,w)\hat\rho(w',w')}} +- \frac{\rho(w,w')} +{\sqrt{\rho(w,w)\rho(w',w')}} +\right|^{1/3} +(\log n)^{2/3} \\ +&\quad\lesssim_\P +\left(\frac n k \right)^{1/3} +\sup_{w,w' \in \cW} |\hat\rho(w,w') - \rho(w,w')|^{1/3} +(\log n)^{2/3} \\ +&\quad\lesssim_\P +\left( \frac{n (\log n)^2}{k} \right)^{1/3} +\sup_{w,w' \in \cW} +\left| +p(w)^\T \hat H^{-1} +\left( +\hat{V}[S] +- \Var[S] +\right) +\hat H^{-1} p(w') +\right|^{1/3} \\ +&\quad\lesssim_\P +\left( \frac{k (\log n)^2}{n} \right)^{1/3} +\left\| +\hat{V}[S] +- \Var[S] +\right\|_2^{1/3}, +\end{align*} +% +and vanishes in probability whenever +$\frac{k (\log n)^2}{n} +\big\| \hat{V}[S] - \Var[S] \big\|_2 \to_\P 0$. +For the plug-in estimator, +% +\begin{align*} +&\left\| +\hat{V}[S] +- \Var[S] +\right\|_2 += +\left\| +\sum_{i=1}^n +p(W_i) p(W_i^\T) +\hat\sigma^2(W_i) +- n \E\left[ +p(W_i) p(W_i^\T) +\sigma^2(W_i) +\right] +\right\|_2 \\ +&\quad\lesssim_\P +\sup_{w \in \cW} +|\hat{\sigma}^2(w)-\sigma^2(w)| +\, \big\| \hat H \big\|_2 \\ +&\qquad+ +\left\| +\sum_{i=1}^n +p(W_i) p(W_i^\T) +\sigma^2(W_i) +- n \E\left[ +p(W_i) p(W_i^\T) +\sigma^2(W_i) +\right] +\right\|_2 \\ +&\quad\lesssim_\P +\frac{n}{k} +\sup_{w \in \cW} +|\hat{\sigma}^2(w)-\sigma^2(w)| ++ \sqrt{n k}, +\end{align*} +% +where the second term is bounded by the same argument +used to bound $\|\hat H - H\|_1$. +Thus, the feasible approximation is valid whenever +$(\log n)^2 \sup_{w \in \cW} +|\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$ +and $\frac{k^3 (\log n)^4}{n} \to 0$. +The validity of the uniform confidence band follows immediately. +% +\end{proof} + +\begin{proof}[Proposition~\ref{pro:yurinskii_local_poly}] + +We apply Proposition~\ref{pro:yurinskii_emp_proc} +with the metric $d(f_w, f_{w'}) = \|w-w'\|_2$ +and the function class +% +\begin{align*} +\cF +&= +\left\{ +(W_i, \varepsilon_i) \mapsto +e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) +\varepsilon_i +:\ w \in \cW +\right\}, +\end{align*} +% +with $\psi$ chosen as a suitable Bernstein Orlicz function. + +\proofparagraph{bounding $H(w)^{-1}$} + +Recall that +$H(w) = \sum_{i=1}^n \E[K_h(W_i-w) p_h(W_i-w)p_h(W_i-w)^\T]$ +and let $a(w) \in \R^k$ with $\|a(w)\|_2 = 1$. +Since the density of $W_i$ is bounded away from zero on $\cW$, +% +\begin{align*} +a(w)^\T H(w) a(w) +&= +n \E\left[ +\big( a(w)^\T p_h(W_i-w) \big)^2 +K_h(W_i-w) +\right] \\ +&\gtrsim +n \int_\cW +\big( a(w)^\T p_h(u-w) \big)^2 +K_h(u-w) +\diff{u} +\gtrsim +n \int_{\frac{\cW-w}{h}} +\big( a(w)^\T p(u) \big)^2 +K(u) +\diff{u}. +\end{align*} +% +This is continuous in $a(w)$ on the compact set +$\|a(w)\|_2 = 1$ +and $p(u)$ forms a polynomial basis so +$a(w)^\T p(u)$ has finitely many zeroes. +Since $K(u)$ is compactly supported +and $h \to 0$, +the above integral is eventually strictly positive +for all $x \in \cW$, +and hence is bounded below uniformly in $w \in \cW$ +by a positive constant. +Therefore +$\sup_{w \in \cW} \|H(w)^{-1}\|_2 \lesssim 1/n$. 
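+To illustrate the preceding argument, consider for example the
+local linear case with $m = 1$ and $p(u) = (1, u)^\T$:
+for any $a(w)$ with $\|a(w)\|_2 = 1$, the polynomial
+$a(w)^\T p(u) = a_1(w) + a_2(w) u$ has at most one zero,
+so, provided the kernel $K$ is non-negative and not almost
+everywhere zero, the integrand $\big(a(w)^\T p(u)\big)^2 K(u)$
+cannot vanish almost everywhere and the integral above is
+strictly positive.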
+ +\proofparagraph{bounding $\beta_\delta$} + +Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$ +with cardinality $|\cF_\delta| \asymp \delta^{-m}$ +and let +$\cF_\delta(W_i, \varepsilon_i) += \big(f(W_i, \varepsilon_i) : f\in \cF_\delta\big)$. +Define the truncated errors +$\tilde\varepsilon_i = +\varepsilon_i\I\{-a \log n \leq \varepsilon_i \leq b \log n\}$ +and note that +$\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ +implies that +$\P(\exists i: \tilde\varepsilon_i \neq \varepsilon_i) +\lesssim n^{1-(a \vee b)/C_\varepsilon}$. +Hence, by choosing $a$ and $b$ large enough, +with high probability, we can replace all +$\varepsilon_i$ by $\tilde\varepsilon_i$. +Further, it is always possible to increase either $a$ or $b$ +along with some randomization to ensure that +$\E[\tilde\varepsilon_i] = 0$. +Since $K$ is bounded and compactly supported, +$W_i$ has a bounded density and +$|\tilde\varepsilon_i| \lesssim \log n$, +% +\begin{align*} +\bigvvvert +f(W_i, \tilde\varepsilon_i) +\bigvvvert_2 +&= +\E\left[ +\left| +e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) +\tilde\varepsilon_i +\right|^2 +\right]^{1/2} \\ +&\leq +\E\left[ +\|H(w)^{-1}\|_2^2 +K_h(W_i-w)^2 +\|p_h(W_i-w)\|_2^2 +\sigma^2(W_i) +\right]^{1/2} \\ +&\lesssim +n^{-1} +\E\left[ +K_h(W_i-w)^2 +\right]^{1/2} +\lesssim +n^{-1} +h^{-m / 2}, \\ +\bigvvvert +f(W_i, \tilde\varepsilon_i) +\bigvvvert_\infty +&\leq +\bigvvvert +\|H(w)^{-1}\|_2 +K_h(W_i-w) +\|p_h(W_i-w)\|_2 +|\tilde\varepsilon_i| +\bigvvvert_\infty \\ +&\lesssim +n^{-1} +\bigvvvert +K_h(W_i-w) +\bigvvvert_\infty +\log n +\lesssim +n^{-1} +h^{-m} +\log n. +\end{align*} +% +Therefore +% +\begin{align*} +\E\left[ +\|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 +\|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty +\right] +&\leq +\!\sum_{f\in\cF_\delta} +\!\bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2^2 +\max_{f\in\cF_\delta} +\bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty +\!\lesssim +n^{-3} \delta^{-m} h^{-2m} \log n. +\end{align*} +% +Let +$V_i(\cF_\delta) = +\E\big[\cF_\delta(W_i, \tilde\varepsilon_i) +\cF_\delta(W_i, \tilde\varepsilon_i)^\T +\mid \cH_{i-1}\big]$ +and $Z_i \sim \cN(0, I_d)$ be i.i.d.\ and +independent of $\cH_n$. +Note that +$V_i(f,f) = \E[f(W_i, \tilde\varepsilon_i)^2 \mid W_i] +\lesssim n^{-2} h^{-2m}$ +and +$\E[V_i(f,f)] = \E[f(W_i, \tilde\varepsilon_i)^2] +\lesssim n^{-2} h^{-m}$. +Thus by Lemma~\ref{lem:yurinskii_app_gaussian_useful}, +% +\begin{align*} +\E\left[ +\big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 +\big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty +\right] +&= +\E\left[ +\E\left[ +\big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 +\big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty +\mid \cH_n +\right] +\right] \\ +&\leq +4 \sqrt{\log 2|\cF_\delta|} +\,\E\Bigg[ +\max_{f \in \cF_\delta} \sqrt{V_i(f,f)} +\sum_{f \in \cF_\delta} V_i(f,f) +\Bigg] \\ +&\lesssim +n^{-3} +h^{-2m} +\delta^{-m} +\sqrt{\log(1/\delta)}. +\end{align*} +% +Thus since $\log(1/\delta) \asymp \log(1/h) \asymp\log n$, +% +\begin{align*} +\beta_\delta +&= +\sum_{i=1}^n +\E\left[ +\|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 +\|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty ++ \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 +\big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty +\right] +\lesssim +\frac{\log n} +{n^2 h^{2m} \delta^m}. 
+\end{align*} + +\proofparagraph{bounding $\Omega_\delta$} + +Let $C_K>0$ be the radius of a $\ell^2$-ball +containing the support of $K$ +and note that +% +\begin{align*} +\left| +V_i(f,f') +\right| +&= +\Big| +\E\Big[ +e_1^\T H(w)^{-1} +p_h(W_i-w) +e_1^\T H(w')^{-1} +p_h(W_i-w') \\ +&\qquad\times +K_h(W_i-w) +K_h(W_i-w') +\tilde\varepsilon_i^2 +\Bigm| \cH_{i-1} +\Big] +\Big| \\ +&\lesssim +n^{-2} +K_h(W_i-w) +K_h(W_i-w') \\ +&\lesssim +n^{-2} +h^{-m} +K_h(W_i-w) +\I\{\|w-w'\|_2 \leq 2 C_K h\}. +\end{align*} +% +Since $W_i$ are $\alpha$-mixing +with $\alpha(j) < e^{-2j / C_\alpha}$, +Lemma~\ref{lem:yurinskii_app_variance_mixing}% +\ref{it:yurinskii_app_variance_mixing_exponential} +with $r=3$ gives +% +\begin{align*} +&\Var\left[ +\sum_{i=1}^n V_i(f,f') +\right] \\ +&\quad\lesssim +\sum_{i=1}^n +\E\left[ +|V_i(f,f')|^3 +\right] ^{2/3} +\lesssim +n^{-3} h^{-2m} +\E\left[ +K_h(W_i-w)^3 +\right] ^{2/3} +\I\{\|w-w'\|_2 \leq 2 C_K h\} \\ +&\quad\lesssim +n^{-3} h^{-2m} +(h^{-2m})^{2/3} +\I\{\|w-w'\|_2 \leq 2 C_K h\} \\ +&\quad\lesssim +n^{-3} h^{-10m/3} +\I\{\|w-w'\|_2 \leq 2 C_K h\}. +\end{align*} +% +Therefore, by Jensen's inequality, +% +\begin{align*} +\E\big[ \|\Omega_\delta\|_2 \big] +&\leq +\E\big[ \|\Omega_\delta\|_\rF \big] +\leq +\E\Bigg[ +\sum_{f,f' \in \cF_\delta} +(\Omega_\delta)_{f,f'}^2 +\Bigg]^{1/2} +\leq +\Bigg( +\sum_{f,f' \in \cF_\delta} +\Var\left[ +\sum_{i=1}^n V_i(f,f') +\right] +\Bigg)^{1/2} \\ +&\lesssim +n^{-3/2} h^{-5m/3} +\Bigg( +\sum_{f,f' \in \cF_\delta} +\I\{\|w-w'\|_2 \leq 2 C_K h\} +\Bigg)^{1/2} \\ +&\lesssim +n^{-3/2} h^{-5m/3} +\big(h^{m} \delta^{-2m} \big)^{1/2} +\lesssim +n^{-3/2} +h^{-7m/6} +\delta^{-m}. +\end{align*} +% +Note that we could have used +$\|\cdot\|_1$ rather than $\|\cdot\|_\rF$, +but this term is negligible either way. + +\proofparagraph{regularity of the stochastic processes} + +For each $f, f' \in \cF$, +define the mean-zero and $\alpha$-mixing random variables +% +\begin{align*} +u_i(f,f') +&= +e_1^\T +\big( +H(w)^{-1} K_h(W_i-w) p_h(W_i-w) +- H(w')^{-1} K_h(W_i-w') p_h(W_i-w') +\big) +\tilde\varepsilon_i. +\end{align*} +% +Note that for all $1 \leq j \leq k$, +by the Lipschitz property of the kernel and monomials, +% +\begin{align*} +&\left| +K_h(W_i-w) - K_h(W_i-w') +\right| \\ +&\quad\lesssim +h^{-m-1} +\|w-w'\|_2 +\big( +\I\{\|W_i-w\| \leq C_K h\} ++ \I\{\|W_i-w'\| \leq C_K h\} +\big), \\ +&\left| +p_h(W_i-w)_j - p_h(W_i-w')_j +\right| +\lesssim +h^{-1} +\|w-w'\|_2, +\end{align*} +% +to deduce that for any $1 \leq j,l \leq k$, +% +\begin{align*} +\big| H(w)_{j l} - H(w')_{j l} \big| +&= +\big| +n \E\big[ +K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \\ +&\qquad- +K_h(W_i-w') p_h(W_i-w')_j p_h(W_i-w')_l +\big] +\big| \\ +&\leq +n\E\left[ +\left| +K_h(W_i-w) - K_h(W_i-w') +\right| +\left| +p_h(W_i-w)_j +p_h(W_i-w)_l +\right| +\right] \\ +&\quad+ +n\E\left[ +\left| +p_h(W_i-w)_j - p_h(W_i-w')_j +\right| +\left| +K_h(W_i-w') +p_h(W_i-w)_l +\right| +\right] \\ +&\quad+ +n\E\left[ +\left| +p_h(W_i-w)_l - p_h(W_i-w')_l +\right| +\left| +K_h(W_i-w') +p_h(W_i-w')_j +\right| +\right] \\ +&\lesssim +n h^{-1}\|w-w'\|_2. +\end{align*} +% +Therefore, as the dimension of the matrix $H(w)$ is fixed, +% +\begin{align*} +\big\| H(w)^{-1} - H(w')^{-1} \big\|_2 +&\leq +\big\| H(w)^{-1}\big\|_2 +\big\| H(w')^{-1}\big\|_2 +\big\| H(w) - H(w') \big\|_2 +\lesssim +\frac{\|w-w'\|_2}{n h}. 
+\end{align*} +% +Hence +% +\begin{align*} +\big| u_i(f,f') \big| +&\leq +\big\| +H(w)^{-1} K_h(W_i-w) p_h(W_i-w) +- H(w')^{-1} K_h(W_i-w') p_h(W_i-w') +\tilde\varepsilon_i +\big\|_2 \\ +&\leq +\big\| H(w)^{-1} - H(w')^{-1} \big\|_2 +\big\| K_h(W_i-w) p_h(W_i-w) +\tilde\varepsilon_i +\big\|_2 \\ +&\quad+ +\big| K_h(W_i-w) - K_h(W_i-w') \big| +\big\| H(w')^{-1} p_h(W_i-w) +\tilde\varepsilon_i +\big\|_2 \\ +&\quad+ +\big\| p_h(W_i-w) - p_h(W_i-w') \big\|_2 +\big\| H(w')^{-1} K_h(W_i-w') +\tilde\varepsilon_i \big\|_2 \\ +&\lesssim +\frac{\|w-w'\|_2}{n h} +\big| K_h(W_i-w) \tilde\varepsilon_i \big| ++ \frac{1}{n} +\big| K_h(W_i-w) - K_h(W_i-w') \big| +\,|\tilde\varepsilon_i| \\ +&\lesssim +\frac{\|w-w'\|_2 \log n}{n h^{m+1}}, +\end{align*} +% +and from the penultimate line, we also deduce that +% +\begin{align*} +\Var[u_i(f,f')] +&\lesssim +\frac{\|w-w'\|_2^2}{n^2h^2} +\E\left[ +K_h(W_i-w)^2 \sigma^2(X_i) +\right] \\ +&\quad+ +\frac{1}{n^2} +\E\left[ +\big( K_h(W_i-w) - K_h(W_i-w') \big)^2 +\sigma^2(X_i) +\right] +\lesssim +\frac{\|w-w'\|_2^2}{n^2h^{m+2}}. +\end{align*} +% +Further, $\E[u_i(f,f') u_j(f,f')] = 0$ for $i \neq j$ so +by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% +\ref{it:yurinskii_app_exponential_mixing_bernstein}, +for a constant $C_1>0$, +% +\begin{align*} +\P\left( +\Big| \sum_{i=1}^n u_i(f,f') \Big| +\geq \frac{C_1 \|w-w'\|_2}{\sqrt n h^{m/2+1}} +\left( +\sqrt{t} ++ \sqrt{\frac{(\log n)^2}{n h^m}} \sqrt t ++ \sqrt{\frac{(\log n)^6}{n h^m}} t +\right) +\right) +&\leq +C_1 e^{-t}. +\end{align*} +% +Therefore, adjusting the constant if necessary +and since $n h^{m} \gtrsim (\log n)^7$, +% +\begin{align*} +\P\left( +\Big| \sum_{i=1}^n u_i(f,f') \Big| +\geq +\frac{C_1 \|w-w'\|_2}{\sqrt{n} h^{m/2+1}} +\left( +\sqrt{t} + \frac{t}{\sqrt{\log n}} +\right) +\right) +&\leq +C_1 e^{-t}. +\end{align*} +% +\Citet[Lemma~2]{van2013bernstein} with +$\psi(x) = +\exp\Big(\big(\sqrt{1+2 x / \sqrt{\log n}}-1 \big)^2 +\log n \Big)-1$ +now shows that +% +\begin{align*} +\Bigvvvert \sum_{i=1}^n u_i(f,f') \Bigvvvert_\psi +&\lesssim +\frac{\|w-w'\|_2}{\sqrt{n} h^{m/2+1}} +\end{align*} +% +so we take $L = \frac{1}{\sqrt{n} h^{m/2+1}}$. +Noting +$\psi^{-1}(t) = \sqrt{\log(1+t)} + \frac{\log(1+t)}{2\sqrt{\log n}}$ +and $N_\delta \lesssim \delta^{-m}$, +% +\begin{align*} +J_\psi(\delta) +&= +\int_0^\delta +\psi^{-1}\big( N_\varepsilon \big) +\diff{\varepsilon} ++ \delta +\psi^{-1} \big( N_\delta \big) +\lesssim +\frac{\delta \log(1/\delta)}{\sqrt{\log n}} ++ \delta \sqrt{\log(1/\delta)} +\lesssim +\delta \sqrt{\log n}, \\ +J_2(\delta) +&= +\int_0^\delta +\sqrt{\log N_\varepsilon} +\diff{\varepsilon} +\lesssim +\delta \sqrt{\log(1/\delta)} +\lesssim +\delta \sqrt{\log n}. 
+\end{align*} + +\proofparagraph{strong approximation} + +Recalling that +$\tilde\varepsilon_i = \varepsilon_i$ +for all $i$ with high probability, +by Proposition~\ref{pro:yurinskii_emp_proc}, +for all $t, \eta > 0$ there exists a +zero-mean Gaussian process $T(w)$ satisfying +% +\begin{align*} +\E\left[ +\left(\sum_{i=1}^n f_w(W_i, \varepsilon_i)\right) +\left(\sum_{i=1}^n f_{w'}(W_i, \varepsilon_i)\right) +\right] +&= \E\big[ T(w) T(w') +\big] +\end{align*} +% +for all $w, w' \in \cW$ and +% +\begin{align*} +&\P\left( +\sup_{w \in \cW} +\left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) +- T(w) \right| +\geq C_\psi(t + \eta) +\right) \\ +&\quad\leq +C_\psi +\inf_{\delta > 0} +\inf_{\cF_\delta} +\Bigg\{ +\frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } ++ \left(\frac{\sqrt{\log 2 |\cF_\delta|} +\sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} \\ +&\qquad+ +\psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} ++ \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) +\Bigg\} \\ +&\quad\leq +C_\psi +\Bigg\{ +\frac{ +\left(\frac{\log n} {n^2 h^{2m} \delta^{m}} \right)^{1/3} +(\log n)^{1/3}}{\eta } ++ \left(\frac{\sqrt{\log n} +\sqrt{n^{-3/2} h^{-7m/6} \delta^{-m}} +}{\eta }\right)^{2/3} \\ +&\qquad+ +\psi\left(\frac{t}{\frac{1}{\sqrt{n} h^{m/2+1}} +J_\psi(\delta)}\right)^{-1} ++ \exp\left(\frac{-t^2}{ +\left( \frac{1}{\sqrt{n} h^{m/2+1}} \right)^2 +J_2(\delta)^2}\right) +\Bigg\} \\ +&\quad\leq +C_\psi +\Bigg\{ +\frac{ +(\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3} \eta} ++ \left(\frac{ +n^{-3/4} h^{-7m/12} \delta^{-m/2} \sqrt{\log n}} +{\eta }\right)^{2/3} \\ +&\qquad+ +\psi\left(\frac{t\sqrt{n} h^{m/2+1}} +{\delta \sqrt{\log n}}\right)^{-1} ++ \exp\left(\frac{-t^2n h^{m+2}} +{\delta^2 \log n}\right) +\Bigg\}. +\end{align*} +% +Noting $\psi(x) \geq e^{x^2/4}$ for $x \leq 4 \sqrt{\log n}$, +any $R_n \to \infty$ gives the probability bound +% +\begin{align*} +\sup_{w \in \cW} +\left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) +- T(w) \right| +&\lesssim_\P +\frac{(\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3}} R_n ++ \frac{\sqrt{\log n}}{n^{3/4} h^{7m/12} \delta^{m/2}} R_n ++ \frac{\delta \sqrt{\log n}} {\sqrt{n} h^{m/2+1}}. +\end{align*} +% +Optimizing over $\delta$ gives +$\delta \asymp \left(\frac{\log n}{n h^{m-6}}\right)^{\frac{1}{2m+6}} += h \left( \frac{\log n}{n h^{3m}} \right)^{\frac{1}{2m+6}}$ +and so +% +\begin{align*} +\sup_{w \in \cW} +\left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) +- T(w) \right| +&\lesssim_\P +\left( +\frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} +\right)^{\frac{1}{2m+6}} R_n. +\end{align*} + +\proofparagraph{convergence of $\hat H(w)$} + +For $1 \leq j,l \leq k$ +define the zero-mean random variables +% +\begin{align*} +u_{i j l}(w) +&= +K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l +- \E\big[K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \big] +\end{align*} +% +and note that +$|u_{i j l}(w)| \lesssim h^{-m}$. +By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% +\ref{it:yurinskii_app_exponential_mixing_bounded} +for a constant $C_2 > 0$ and all $t > 0$, +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n +u_{i j l}(w) +\right| +> C_2 h^{-m} \big( \sqrt{n t} ++ (\log n)(\log \log n) t \big) +\right) +&\leq +C_2 e^{-t}. +\end{align*} +% +Further, note that by Lipschitz properties, +% +\begin{align*} +\left| +\sum_{i=1}^n u_{i j l}(w) +- \sum_{i=1}^n u_{i j l}(w') +\right| +&\lesssim +h^{-m-1} \|w-w'\|_2 +\end{align*} +% +so there is a $\delta$-cover of $(\cW, \|\cdot\|_2)$ +with size at most $n^a \delta^{-a}$ for some $a > 0$. 
+Adjusting $C_2$, +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\left| +\sum_{i=1}^n +u_{i j l}(w) +\right| +> C_2 h^{-m} \big( \sqrt{n t} ++ (\log n)(\log \log n) t \big) ++ C_2 h^{-m-1} \delta +\right) +&\leq +C_2 n^a \delta^{-a} +e^{-t} +\end{align*} +% +and hence +% +\begin{align*} +\sup_{w \in \cW} +\left| +\sum_{i=1}^n +u_{i j l}(w) +\right| +&\lesssim_\P +h^{-m} \sqrt{n \log n} ++ h^{-m} (\log n)^3 +\lesssim_\P +\sqrt{\frac{n \log n}{h^{2m}}}. +\end{align*} +% +Therefore +% +\begin{align*} +\sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 +&\lesssim_\P +\sqrt{\frac{n \log n}{h^{2m}}}. +\end{align*} + +\proofparagraph{bounding the matrix term} + +Firstly, note that +since $\sqrt{\frac{\log n}{n h^{2m}}} \to 0$, +we have that uniformly in $w \in \cW$ +% +\begin{align*} +\|\hat H(w)^{-1}\|_2 +\leq +\frac{\|H(w)^{-1}\|_2} +{1 - \|\hat H(w)-H(w)\|_2 \|H(w)^{-1}\|_2} +&\lesssim_\P +\frac{1/n} +{1 - \sqrt{\frac{n \log n}{h^{2m}}} \frac{1}{n}} +\lesssim_\P +\frac{1}{n}. +\end{align*} +% +Therefore +% +\begin{align*} +&\sup_{w \in \cW} +\big| +e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) +S(w) +\big| +\leq +\sup_{w \in \cW} +\big\|\hat H(w)^{-1} - H(w)^{-1}\big\|_2 +\|S(w)\|_2 \\ +&\quad\leq +\sup_{w \in \cW} +\big\|\hat H(w)^{-1}\big\|_2 +\big\|H(w)^{-1}\big\|_2 +\big\|\hat H(w) - H(w)\big\|_2 +\|S(w)\|_2 +\lesssim_\P +\sqrt{\frac{\log n}{n^3 h^{2m}}} +\sup_{w \in \cW} +\|S(w)\|_2. +\end{align*} +% +Now for $1 \leq j \leq k$ write +$u_{i j}(w) = K_h(W_i-w) p_h(W_i-w)_j \tilde \varepsilon_i$ +so that $S(w)_j = \sum_{i=1}^n u_{i j}(w)$ with high probability. +Note that $u_{i j}(w)$ are zero-mean with +$\Cov[u_{i j}(w), u_{i' j}(w)] = 0$ for $ i \neq i'$. +Also $|u_{i j}(w)| \lesssim h^{-m} \log n$ +and $\Var[u_{i j}(w)] \lesssim h^{-m}$. +By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% +\ref{it:yurinskii_app_exponential_mixing_bernstein} +for a constant $C_3>0$, +% +\begin{align*} +\P\left( +\Big| \sum_{i=1}^n u_{i j}(w) \Big| +\geq C_3 \big( (h^{-m/2} \sqrt n + h^{-m} \log n) \sqrt t ++ h^{-m} (\log n)^3 t \big) +\right) +&\leq +C_3 e^{-t}, \\ +\P\left( +\Big| \sum_{i=1}^n u_{i j}(w) \Big| +> +C_3 \left( +\sqrt{\frac{tn}{h^{m}}} ++ \frac{t(\log n)^3}{h^{m}} +\right) +\right) +&\leq +C_3 e^{-t}, +\end{align*} +% +where we used $n h^{m} \gtrsim (\log n)^2$ +and adjusted the constant if necessary. +As before, +$u_{i j}(w)$ is Lipschitz in $w$ with a constant which is at most +polynomial in $n$, +so for some $a>0$ +% +\begin{align*} +\P\left( +\sup_{w \in \cW} +\Big| \sum_{i=1}^n u_{i j}(w) \Big| +> +C_3 \left( +\sqrt{\frac{tn}{h^{m}}} ++ \frac{t(\log n)^3}{h^{m}} +\right) +\right) +&\leq +C_3 n^a e^{-t}, \\ +\sup_{w \in \cW} +\|S(w)\|_2 +\lesssim_\P +\sqrt{\frac{n \log n}{h^{m}}} ++ \frac{(\log n)^4}{h^{m}} +&\lesssim_\P +\sqrt{\frac{n \log n}{h^{m}}} +\end{align*} +% +as $n h^m \gtrsim (\log n)^7$. +Finally, +% +\begin{align*} +\sup_{w \in \cW} +\big| +e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) +S(w) +\big| +&\lesssim_\P +\sqrt{\frac{\log n}{n^3 h^{2m}}} +\sqrt{\frac{n \log n}{h^{m}}} +\lesssim_\P +\frac{\log n}{\sqrt{n^2 h^{3m}}}. +\end{align*} + +\proofparagraph{bounding the bias} + +Since $\mu \in \cC^\gamma$, we have, by the multivariate version of Taylor's +theorem, +% +\begin{align*} +\mu(W_i) +&= +\sum_{|\kappa|=0}^{\gamma-1} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w) +(W_i-w)^\kappa ++ \sum_{|\kappa|=\gamma} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w') +(W_i-w)^\kappa +\end{align*} +% +for some $w'$ on the line segment connecting +$w$ and $W_i$. 
+Now since $p_h(W_i-w)_1 = 1$, +% +\begin{align*} +&e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(w) \\ +&\quad= +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T e_1 \mu(w) += e_1^\T e_1 \mu(w) = \mu(w). +\end{align*} +% +Therefore +% +\begin{align*} +\Bias(w) +&= +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) +- \mu(w) \\ +&= +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \\ +&\quad\times +\Bigg( +\sum_{|\kappa|=0}^{\gamma-1} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w) +(W_i-w)^\kappa ++ \sum_{|\kappa|=\gamma} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w') +(W_i-w)^\kappa +- \mu(w) +\Bigg) \\ +&= +\sum_{|\kappa|=1}^{\gamma-1} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w) +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +(W_i-w)^\kappa \\ +&\quad+ +\sum_{|\kappa|=\gamma} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w') +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +(W_i-w)^\kappa \\ +&= +\sum_{|\kappa|=\gamma} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w') +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +(W_i-w)^\kappa, +\end{align*} +% +where we used that +$p_h(W_i-w)$ is a vector containing monomials +in $W_i-w$ of order up to $\gamma$, so +$e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +(W_i-w)^\kappa = 0$ +whenever $1 \leq |\kappa| \leq \gamma$. +Finally, +% +\begin{align*} +\sup_{w\in\cW} +|\Bias(w)| +&= +\sup_{w\in\cW} +\Bigg| +\sum_{|\kappa|=\gamma} +\frac{1}{\kappa!} +\partial^{\kappa} \mu(w') +e_1^\T \hat H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +(W_i-w)^\kappa +\Bigg| \\ +&\lesssim_\P +\sup_{w\in\cW} +\max_{|\kappa| = \gamma} +\left| +\partial^{\kappa} \mu(w') +\right| +\|\hat H(w)^{-1}\|_2 +\Bigg\| +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +\Bigg\|_2 +h^\gamma \\ +&\lesssim_\P +\frac{h^\gamma}{n} +\sup_{w\in\cW} +\Bigg\| +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) +\Bigg\|_2. +\end{align*} +% +Write +$\tilde u_{i j}(w) = K_h(W_i-w)p_h(W_i-w)_j$ +and note $|\tilde u_{i j}(w)| \lesssim h^{-m}$ +and $\E[\tilde u_{i j}(w)] \lesssim 1$, so +% +\begin{align*} +\P\left( +\left| +\sum_{i=1}^n \tilde u_{i j}(w) +- \E\left[ +\sum_{i=1}^n \tilde u_{i j}(w) +\right] +\right| +> C_4 h^{-m} \big( \sqrt{n t} ++ (\log n)(\log \log n) t \big) +\right) +&\leq +C_4 e^{-t} +\end{align*} +% +by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% +\ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_4$, +By Lipschitz properties, this implies +% +\begin{align*} +\sup_{w \in \cW} +\left| +\sum_{i=1}^n \tilde u_{i j}(w) +\right| +&\lesssim_\P +n +\left( +1 + \sqrt{\frac{\log n}{n h^{2m}}} +\right) +\lesssim_\P +n. +\end{align*} +% +Therefore +$\sup_{w\in\cW} |\Bias(w)| +\lesssim_\P n h^\gamma / n +\lesssim_\P h^\gamma$. + +\proofparagraph{conclusion} + +By the previous parts, +% +\begin{align*} +\sup_{w \in \cW} +\left|\hat \mu(w) - \mu(w) - T(w) \right| +&\leq +\sup_{w \in \cW} +\left|e_1^\T H(w)^{-1} S(w) - T(w) \right| \\ +&\quad+ +\sup_{w \in \cW} +\left| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \right| ++ \sup_{w \in \cW} +|\Bias(w)| \\ +&\lesssim_\P +\left( +\frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} +\right)^{\frac{1}{2m+6}} R_n ++ \frac{\log n}{\sqrt{n^2 h^{3m}}} ++ h^\gamma \\ +&\lesssim_\P +\frac{R_n}{\sqrt{n h^m}} +\left( +\frac{(\log n)^{m+4}}{n h^{3m}} +\right)^{\frac{1}{2m+6}} ++ h^\gamma, +\end{align*} +% +where the last inequality follows because +$n h^{3m} \to \infty$ +and $\frac{1}{2m+6} \leq \frac{1}{2}$. 
+Finally, we verify the upper and lower bounds +on the variance of the Gaussian process. +Since the spectrum of $H(w)^{-1}$ +is bounded above and below by $1/n$, +% +\begin{align*} +\Var[T(w)] +&= +\Var\left[ +e_1^\T H(w)^{-1} +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i +\right] \\ +&= +e_1^\T H(w)^{-1} +\Var\left[ +\sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i +\right] +H(w)^{-1} e_1^\T \\ +&\lesssim +\|H(w)^{-1}\|_2^2 +\max_{1 \leq j \leq k} +\sum_{i=1}^n +\Var\big[ +K_h(W_i-w) p_h(W_i-w)_j \sigma(W_i) +\big] \\ +&\lesssim +\frac{1}{n^2} n +\frac{1}{h^m} +\lesssim +\frac{1}{n h^m}. +\end{align*} +% +Similarly, +$\Var[T(w)] \gtrsim \frac{1}{n h^m}$ +by the same argument used to bound eigenvalues of +$H(w)^{-1}$. +% +\end{proof} + +\section{High-dimensional central limit theorems for martingales}% +\label{sec:yurinskii_app_high_dim_clt} + +We present an application of our main results to +high-dimensional central limit theorems for martingales. Our main +contribution here is the generality of our results, which are broadly +applicable to martingale data and impose minimal extra assumptions. In exchange +for the scope and breadth of our results, we naturally do not necessarily +achieve state-of-the-art distributional approximation errors in certain special +cases, such as with independent data or when restricting the class of sets over +which the central limit theorem must hold. Extensions of our high-dimensional +central limit theorem results to mixingales and other approximate martingales, +along with third-order refinements and Gaussian mixture target distributions, +are possible through methods akin to those used to establish our main results +in Section~\ref{sec:yurinskii_main_results}, but we omit these for succinctness. + +Our approach to deriving a high-dimensional martingale central limit theorem +proceeds as follows. Firstly, the upcoming +Proposition~\ref{pro:yurinskii_app_clt} uses our +main result on martingale coupling +(Corollary~\ref{cor:yurinskii_sa_martingale}) to +reduce the problem to that of providing anti-concentration results for +high-dimensional Gaussian vectors. We then demonstrate the utility of this +reduction by employing a few such anti-concentration methods from the existing +literature. Proposition~\ref{pro:yurinskii_app_bootstrap} gives a feasible +implementation via +the Gaussian multiplier bootstrap, enabling valid +resampling-based inference using +the resulting conditional Gaussian distribution. Finally, in +Section~\ref{sec:yurinskii_app_lp} we provide an example application: +distributional +approximation for $\ell^p$-norms of high-dimensional martingale vectors +in Kolmogorov--Smirnov distance, relying on some recent results +concerning Gaussian perimetric inequalities +\citep{nazarov2003maximal,kozbur2021dimension, +giessing2023anti,chernozhukov2017detailed}. + +We begin this section with some notation. Assume the setup of +Corollary~\ref{cor:yurinskii_sa_martingale} and suppose $\Sigma$ is +non-random. Let $\cA$ be a class of measurable subsets of +$\R^d$ and take $T \sim \cN(0, \Sigma)$. +For $\eta>0$ and $p \in [1, \infty]$ define the Gaussian perimetric quantity +% +\begin{align*} +\Delta_p(\cA, \eta) +&= +\sup_{A\in \cA} +\big\{\P(T\in A_p^\eta\setminus A) +\vee \P(T\in A \setminus A_p^{-\eta})\big\}, +\end{align*} +% +where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$, +$A_p^{-\eta} = \R^d \setminus (\R^d \setminus A)_p^\eta$, +and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$. 
+Using this perimetric term allows us to convert coupling results +to central limit theorems as follows. +Denote by $\Gamma_p(\eta)$ the rate of strong approximation attained in +Corollary~\ref{cor:yurinskii_sa_martingale}: +% +\begin{align*} +\Gamma_p(\eta) +&= +24 \left( +\frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} +\right)^{1/3} ++ 17 \left( +\frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} +\right)^{1/3}. +\end{align*} + +\begin{proposition}[High-dimensional central limit theorem for martingales]% +\label{pro:yurinskii_app_clt} + +Take the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, +and $\Sigma$ non-random. +For a class $\cA$ of measurable sets in $\R^d$, +% +\begin{equation}% +\label{eq:yurinskii_app_high_dim_clt} +\sup_{A\in \cA} +\big|\P(S\in A) -\P(T\in A)\big| +\leq \inf_{p \in [1, \infty]} \inf_{\eta>0} +\big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. +\end{equation} +\end{proposition} + +\begin{proof}[Proposition~\ref{pro:yurinskii_app_clt}] + +This follows from Strassen's theorem +(Lemma~\ref{lem:yurinskii_app_strassen}), but we +provide a proof for completeness. +% +\begin{align*} +\P(S \in A) +&\leq +\P(T \in A) ++ \P(T \in A_p^\eta \setminus A) ++ \P(\|S - T\| > \eta) +\end{align*} +% +and applying this to $\R^d \setminus A$ gives +% +\begin{align*} +\P(S\in A) +&= +1 - \P(S\in \R^d \setminus A) \\ +&\geq +1 - \P(T \in \R^d \setminus A) +- \P(T \in (\R^d \setminus A)_p^\eta \setminus (\R^d \setminus A)) +- \P(\|S - T\| > \eta) \\ +&= +\P(T \in A) +- \P(T \in A \setminus A_p^{-\eta}) +- \P(\|S - T\| > \eta). +\end{align*} +% +Since this holds for all $p \in [1, \infty]$, +% +\begin{align*} +\sup_{A\in \cA} +\big|\P(S\in A) -\P(T\in A)\big| +&\leq +\sup_{A \in \cA} +\big\{\P(T \in A_p^\eta\setminus A) +\vee \P(T \in A \setminus A_p^{-\eta})\big\} ++ \P(\|S - T\| > \eta) \\ +&\leq +\inf_{p \in [1, \infty]} \inf_{\eta>0} +\big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. +\end{align*} +% +\end{proof} + +The term $\Delta_p(\cA, \eta)$ +in \eqref{eq:yurinskii_app_high_dim_clt} is a Gaussian anti-concentration +quantity +so it depends on the law of $S$ only through the covariance matrix $\Sigma$. +A few results are available in the literature +for bounding this term. +For instance, with +$\cA = \cC = \{A \subseteq \R^d \text{ is convex}\}$, +\citet{nazarov2003maximal} showed +% +\begin{equation}% +\label{eq:yurinskii_app_convex_anticonc} +\Delta_2(\cC, \eta) +\asymp +\eta\sqrt{\|\Sigma^{-1}\|_{\rF}}, +\end{equation} +% +whenever $\Sigma$ is invertible. +Proposition~\ref{pro:yurinskii_app_clt} with $p=2$ +and \eqref{eq:yurinskii_app_convex_anticonc} yield for convex sets +% +\begin{align*} +\sup_{A\in \cC} +\big|\P(S\in A) -\P(T\in A)\big| +&\lesssim +\inf_{\eta > 0} +\left\{ +\left(\frac{\beta_{p,2} d}{\eta^3}\right)^{1/3} ++ \left(\frac{\E[\|\Omega \|_2] d}{\eta^2}\right)^{1/3} ++ \eta \sqrt{\|\Sigma^{-1}\|_\rF} +\right\}. +\end{align*} + +Alternatively, one can take $\cA = \cR$, +the class of axis-aligned rectangles in $\R^d$. +By Nazarov's Gaussian perimetric inequality +\citep{nazarov2003maximal,chernozhukov2017central}, +% +\begin{align}% +\label{eq:yurinskii_app_rect_anticonc} +\Delta_\infty(\cR, \eta) +\leq \frac{\eta (\sqrt{2\log d} + 2)}{\sigma_{\min}} +\end{align} +% +whenever $\min_j \, \Sigma_{j j} \geq \sigma_{\min}^2$ +for some $\sigma_{\min}>0$. 
+Proposition~\ref{pro:yurinskii_app_clt} with $p = \infty$ +and \eqref{eq:yurinskii_app_rect_anticonc} yields +% +\begin{align*}% +&\sup_{A\in \cR} +\big|\P(S\in A) -\P(T\in A)\big| +\lesssim +\inf_{\eta > 0} +\left\{ +\left(\frac{\beta_{\infty,2} \log 2d}{\eta^3}\right)^{1/3} ++ \left(\frac{\E[\|\Omega \|_2] \log 2d}{\eta^2}\right)^{1/3} ++ \frac{\eta \sqrt{\log 2d}}{\sigma_{\min}} +\right\}. +\end{align*} +% +In situations where +$\liminf_n \min_j \, \Sigma_{j j} = 0$, +it may be possible in certain cases to regularize +the minimum variance away from zero and then apply +a Gaussian--Gaussian rectangular approximation result +such as Lemma~2.1 from \citet{chernozhukov2023nearly}. + +\begin{remark}[Comparisons with the literature] + +The literature on high-dimensional central limit theorems +has developed rapidly in recent years +\citep[see][and references therein]{% +zhai2018high,% +koike2021notes,% +buzun2022strong,% +lopes2022central,% +chernozhukov2023nearly% +}, +particularly for the special case of +sums of independent random vectors +on the rectangular sets $\cR$. +% +Our corresponding results are rather weaker in terms of +dependence on the dimension than for example +\citet[Theorem~2.1]{chernozhukov2023nearly}. +This is an inherent issue due to our approach of first +considering the class of all Borel sets +and only afterwards specializing to the smaller class $\cR$, +where sharper results in the literature directly target the +Kolmogorov--Smirnov distance via Stein's method and Slepian interpolation. +\end{remark} + +Next, we present a version of Proposition~\ref{pro:yurinskii_app_clt} in which +the covariance +matrix $\Sigma$ is replaced by an estimator $\hat \Sigma$. This ensures that +the associated conditionally Gaussian vector is feasible and can be resampled, +allowing Monte Carlo quantile estimation via a Gaussian +multiplier bootstrap. + +\begin{proposition}[Bootstrap central limit theorem for martingales]% +\label{pro:yurinskii_app_bootstrap} + +Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, +with $\Sigma$ non-random, +and let $\hat \Sigma$ be an $\bX$-measurable random +$d \times d$ positive semi-definite matrix, +where $\bX = (X_1, \ldots, X_n)$. +For a class $\cA$ of measurable subsets of $\R^d$, +% +\begin{align*} +&\sup_{A\in \cA} +\left| +\P\big(S \in A\big) +- \P\big(\hat \Sigma^{1/2} Z \in A \bigm| \bX \big) +\right| \\ +&\quad\leq +\inf_{p \in [1,\infty]} \inf_{\eta>0} +\left\{ \Gamma_p(\eta) + 2 \Delta_p(\cA, \eta) ++ 2d \exp\left(\frac{-\eta^2} +{2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} +\right) +\right\}, +\end{align*} +% +where $Z \sim \cN(0,I_d)$ is independent of $\bX$. +\end{proposition} + +\begin{proof}[Proposition~\ref{pro:yurinskii_app_bootstrap}] + +Since $T = \Sigma^{1/2} Z$ is independent of $\bX$, +% +\begin{align*} +&\left| +\P\big(S \in A\big) +- \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) +\right| \\ +&\quad\leq +\left| +\P\big(S \in A\big) +- \P\big(T \in A\big) +\right| ++\left| +\P\big(\Sigma^{1/2} Z \in A\big) +- \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) +\right|. +\end{align*} +% +The first term is bounded by Proposition~\ref{pro:yurinskii_app_clt}; +the second by Lemma~\ref{lem:yurinskii_app_feasible_gaussian} +conditional on $\bX$. 
+% +\begin{align*} +&\left| +\P\big(S \in A\big) +- \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) +\right| \\ +&\quad\leq +\Gamma_p(\eta) + \Delta_p(\cA, \eta) ++ \Delta_{p'}(\cA, \eta') ++ 2 d \exp \left( \frac{-\eta'^2} +{2 d^{2/p'} \big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} +\right) +\end{align*} +% +for all $A \in \cA$ +and any $p, p' \in [1, \infty]$ and $\eta, \eta' > 0$. +Taking a supremum over $A$ and infima over +$p = p'$ and $\eta = \eta'$ yields the result. +We do not need +$p = p'$ and $\eta = \eta'$ in general. +% +\end{proof} + +A natural choice for $\hat\Sigma$ in certain situations is the sample +covariance matrix $\sum_{i=1}^n X_i X_i^\T$, or a correlation-corrected variant +thereof. In general, whenever $\hat \Sigma$ does not depend on unknown +quantities, one can sample from the law of $\hat T = \hat\Sigma^{1/2} Z$ +conditional on $\bX$ to approximate the distribution of $S$. +Proposition~\ref{pro:yurinskii_app_bootstrap} verifies that this Gaussian +multiplier +bootstrap approach is valid whenever $\hat\Sigma$ and $\Sigma$ are sufficiently +close. To this end, Theorem~X.1.1 in \citet{bhatia1997matrix} gives +$\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 +\leq \big\|\hat\Sigma - \Sigma\big\|_2^{1/2}$ +and Problem~X.5.5 in the same gives +$\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 +\leq \big\|\Sigma^{-1/2}\big\|_2 \big\|\hat\Sigma - \Sigma\big\|_2$ +when $\Sigma$ is invertible. The latter often gives a tighter bound when the +minimum eigenvalue of $\Sigma$ can be bounded away from zero, and consistency +of $\hat \Sigma$ can be established using a range of matrix concentration +inequalities. + +In Section~\ref{sec:yurinskii_app_lp} we apply +Proposition~\ref{pro:yurinskii_app_clt} to the special case +of approximating the distribution of the $\ell^p$-norm of a high-dimensional +martingale. Proposition~\ref{pro:yurinskii_app_bootstrap} is then used to +ensure that +feasible distributional approximations are also available. + +\subsection{Application: distributional approximation of martingale +\texorpdfstring{$\ell^p$}{lp}-norms} +\label{sec:yurinskii_app_lp} + +In empirical applications, +including nonparametric significance tests +\citep{lopes2020bootstrapping} +and nearest neighbor search procedures +\citep{biau2015high}, +an estimator or test statistic +can be expressed under the null hypothesis +as the $\ell^p$-norm of a zero-mean +martingale for some $p \in [1, \infty]$. +In the notation of Corollary~\ref{cor:yurinskii_sa_martingale}, +it is of interest to bound Kolmogorov--Smirnov +quantities of the form +$\sup_{t \geq 0} \big| \P( \|S\|_p \leq t) - \P( \|T\|_p \leq t) \big|$. +Let $\cB_p$ be the class of closed $\ell^p$-balls in $\R^d$ centered at the +origin and set +$\Delta_p(\eta) \vcentcolon= \Delta_p(\cB_p, \eta) += \sup_{t \geq 0} \P( t < \|T\|_p \leq t + \eta )$. + +\begin{proposition}[Distributional approximation of +martingale $\ell^p$-norms] +\label{pro:yurinskii_app_application_lp} + +Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, +with $\Sigma$ non-random. Then for $T \sim \cN(0, \Sigma)$, +% +\begin{equation}% +\label{eq:yurinskii_app_application_lp} +\sup_{t \geq 0} +\big| \P( \|S\|_p \leq t ) +- \P\left( \|T\|_p \leq t \right) \big| +\leq \inf_{\eta>0} +\big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. 
+\end{equation} +% +\end{proposition} + +\begin{proof}[Proposition~\ref{pro:yurinskii_app_application_lp}] + +Applying Proposition~\ref{pro:yurinskii_app_clt} +with $\cA=\cB_p$ gives +% +\begin{align*} +\sup_{t \geq 0} +\big| \P( \|S\|_p \leq t ) +- \P\left( \|T\|_p \leq t \right) \big| +&= \sup_{A\in \cB_p} +\big|\P(S\in A) -\P(T\in A)\big| \\ +&\leq +\inf_{\eta>0} +\big\{\Gamma_p(\eta) + \Delta_p(\cB_p, \eta) \big\} +\leq +\inf_{\eta>0} +\big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. +\end{align*} +% +\end{proof} + +The right-hand side of +\eqref{eq:yurinskii_app_application_lp} can be controlled in various ways. +% +In the case of $p=\infty$, +note that $\ell^\infty$-balls are rectangles so +$\cB_\infty\subseteq \cR$ +and \eqref{eq:yurinskii_app_rect_anticonc} applies, giving +$\Delta_\infty(\eta) \leq \eta (\sqrt{2\log d} + 2) / \sigma_{\min}$ +whenever $\min_j \Sigma_{j j} \geq \sigma_{\min}^2$. +Alternatively, \citet[Theorem~1]{giessing2023anti} provides +$\Delta_\infty(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_\infty] + \eta^2}$. +By H{\"o}lder duality of $\ell^p$-norms, we can write +$\|T\|_p = \sup_{\|u\|_q \leq 1} u^\T T$ where $1/p + 1/q = 1$. +Applying the Gaussian process anti-concentration result of +\citet[Theorem~2]{giessing2023anti} yields the more general +$\Delta_p(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_p] + \eta^2}$. +Thus, the problem can be reduced to that of bounding +$\Var\left[\|T\|_p\right]$, with techniques for doing so +discussed in \citet[Section~4]{giessing2023anti}. +Alongside the $\ell^p$-norms, other functionals can be analyzed in this manner, +including the maximum and other order statistics +\citep{kozbur2021dimension,giessing2023anti}. + +To conduct inference in this setting, we must feasibly +approximate the quantiles of $\|T\|_p$. +To that end, take a significance level $\tau\in(0,1)$ and set +% +$\hat q_p(\tau) = +\inf \big\{t \in \R: \P(\|\hat T\|_p \leq t \mid \bX) \geq \tau \}$ +where $\hat T \mid \bX \sim \cN(0, \hat\Sigma)$, +% +with $\hat\Sigma$ any $\bX$-measurable positive semi-definite +estimator of $\Sigma$. +Note that for the canonical estimator $\hat\Sigma = \sum_{i=1}^n X_i X_i^\T$ +we can write $\hat T =\sum_{i=1}^n X_i Z_i$ with +$Z_1,\dots,Z_n$ i.i.d.\ standard Gaussian independent of $\bX$, +yielding the Gaussian multiplier bootstrap. +Now assuming +the law of $\|\hat T\|_p \mid \bX$ has no atoms, +we can apply Proposition~\ref{pro:yurinskii_app_bootstrap} +to see +% +\begin{align*} +&\sup_{\tau\in(0,1)} +\big|\P\left(\|S\|_p \leq \hat q_p(\tau)\right) - \tau \big| +\leq +\E\left[ +\sup_{t \geq 0} +\big| +\P(\|S\|_p \leq t) +- \P(\|\hat T\|_p \leq t \mid \bX) +\big| +\right] \\ +&\qquad\leq +\inf_{\eta>0} +\left\{ \Gamma_p(\eta) ++ 2 \Delta_p(\eta) ++ 2d\, \E\left[ +\exp\left(\frac{-\eta^2} +{2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2}\right) +\right] +\right\}, +\end{align*} +% +and hence the bootstrap is valid whenever +$\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2$ is sufficiently small. See the +preceding discussion regarding methods for bounding this object. + +\begin{remark}[One-dimensional distributional approximations] +In our application to distributional approximation of $\ell^p$-norms, +the object of interest $\|S\|_p$ is a +one-dimensional functional of the high-dimensional martingale; +contrast this with the more general Proposition~\ref{pro:yurinskii_app_clt} +which +directly considers the $d$-dimensional random vector $S$. 
+As such, our coupling-based approach may be improved in certain settings +by applying a more carefully tailored smoothing argument. +For example, \citet{belloni2018high} +employ a ``log sum exponential'' bound +\citep[see also][]{chernozhukov2013gaussian} +for the maximum statistic +$\max_{1 \leq j \leq d} S_j$ +along with a coupling due to \citet{chernozhukov2014gaussian} to attain +an improved dependence on the dimension. +Naturally, their approach does not permit the formulation of +high-dimensional central limit theorems over arbitrary classes of +Borel sets as in our Proposition~\ref{pro:yurinskii_app_clt}. +\end{remark} + +\clearpage +\addcontentsline{toc}{chapter}{Bibliography} +\bibliographystyle{phd_dissertation} +\bibliography{refs} + +\end{document} diff --git a/tests/example3_out.tex b/tests/phd_dissertation_out.tex similarity index 100% rename from tests/example3_out.tex rename to tests/phd_dissertation_out.tex diff --git a/tests/example4_in.tex b/tests/readme_in.tex similarity index 100% rename from tests/example4_in.tex rename to tests/readme_in.tex diff --git a/tests/example4_out.tex b/tests/readme_out.tex similarity index 100% rename from tests/example4_out.tex rename to tests/readme_out.tex diff --git a/tests/short_document_in.tex b/tests/short_document_in.tex new file mode 100644 index 0000000..d6019bd --- /dev/null +++ b/tests/short_document_in.tex @@ -0,0 +1,58 @@ +\documentclass{article} + +\usepackage{amsmath} +\usepackage{amsthm} + +\newtheorem{theorem}{Theorem} + +\title{Testing \texttt{texfmt}} +\author{William G.\ Underwood} +\begin{document} +\maketitle + +\begin{align} +E = m c^2 \\ +1 + 2 ++ (3 + 4) ++ (5 + 6 ++ 7 + 8) ++ (9 + 10 ++ 11 + 12 ++ 13 + 14) +\end{align} + +\begin{itemize} +\item Item one % trailing comment with ]) brackets +\item Item two on +multiple lines +\item +\item Item three +\begin{itemize} +\item Subitem one of item two +\item Subitem two of item two +\end{itemize} +\item Item four % trailing comment with [( brackets +\end{itemize} + +\begin{theorem}[Pythagoras]% +\label{thm:pythagoras} + +For a right triangle with hypotenuse $c$ and other sides $a$ and $b$, +we have +% +\begin{align*} +a^2 + b^2 = c^2 +\end{align*} +% +% some comments + +\end{theorem} + +This line contains \emph{emphasized} text. +\emph{This line contains only emphasized text, +and is broken over two lines}. +\emph{This line contains only +emphasized text, +and is broken over three lines}. 
+ +\end{document} diff --git a/tests/example1_in.tex b/tests/short_document_out.tex similarity index 100% rename from tests/example1_in.tex rename to tests/short_document_out.tex From b5e5d935d916f1c3b16fdcf0d8d10053ddb9c467 Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 17:35:12 -0400 Subject: [PATCH 07/13] Working on new main structure --- notes.org | 48 ++++++------ src/indent.rs | 73 ++++++++++++++++++ src/main.rs | 143 +++++++---------------------------- src/regexes.rs | 33 ++++++++ src/tests.rs | 19 ++--- tests/short_document_in.tex | 4 +- tests/short_document_out.tex | 4 +- 7 files changed, 172 insertions(+), 152 deletions(-) create mode 100644 src/indent.rs create mode 100644 src/regexes.rs diff --git a/notes.org b/notes.org index c4e43d4..30e01ba 100644 --- a/notes.org +++ b/notes.org @@ -1,31 +1,35 @@ #+title: tex-fmt * Tests ** One feature per file -** Benchmark running all tests -*** Give number of files and total line numbers -** Look for problem cases in other documents +** Look for problem cases in other latex documents * Features -** Commands on own lines -*** Implement this by regex on whole document at start -*** Care about multiple or long arguments -*** Look into bracket matching -*** \begin and \end -*** \section, \section*, \chapter, \subsection etc -*** \author and \title etc -*** \usepackage and \documentclass -*** \label -*** \centering -*** \includegraphics -*** \caption -*** \newpage -*** \appendix -** Fold long lines +** Begin and end document on own line +** Environments should start and end on new lines +*** New line before begin/end with regex replace +** Fold long lines to 80 characters +*** Care with trailing comments +*** No folding in verbatim environments, just warn +** Merge short alphanumeric lines ** Flags -*** Dry run (do not modify files) -*** Print to STDOUT -*** Debug (display log file) +*** -d Dry run +*** -p Print to STDOUT +*** -v Info verbose +*** -vv Debug verbose ** Log file +** Non-negativity and end-of-file checks apply per indenter type +*** Care with half-open intervals like [0,1) +*** Just use ignore comments for these cases +** Ignore some source lines +*** Line-by-line ignore +*** Block ignore * Bugs -** Better errors including line numbers in source file ** Check multiple begins or ends on the same line +*** This should never happen once new lines are implemented +*** Still should implement counting for cumulative and back +** Better errors including line numbers in source file ** Check begins or ends with more brackets on the same line +* Structure +** Perform indenting +** While long lines are present +*** Trim long lines +*** Perform indenting diff --git a/src/indent.rs b/src/indent.rs new file mode 100644 index 0000000..99cdf77 --- /dev/null +++ b/src/indent.rs @@ -0,0 +1,73 @@ +pub mod indent { + + use core::cmp::max; + use crate::regexes::regexes::*; + + const OPENS: [char; 3] = ['(', '[', '{']; + const CLOSES: [char; 3] = [')', ']', '}']; + + #[derive(Debug)] + pub struct Indent { + /// actual running indentation count at end of current line + pub actual: i8, + /// visual indentation of current line + pub visual: i8, + } + + /// calculate total indentation change due to current line + pub fn get_diff(line: &str) -> i8 { + if RE_DOCUMENT_BEGIN.is_match(line) || RE_DOCUMENT_END.is_match(line) { + return 0; + }; + + let mut diff: i8 = 0; + if RE_ENV_BEGIN.is_match(line) { + diff += 1 + }; + if RE_ENV_END.is_match(line) { + diff -= 1 + }; + if 
RE_ENV_BEGIN.is_match(line) { + diff += 1 + }; + if RE_ENV_END.is_match(line) { + diff -= 1 + }; + + for c in OPENS { + diff += line.chars().filter(|&x| x == c).count() as i8; + } + for c in CLOSES { + diff -= line.chars().filter(|&x| x == c).count() as i8; + } + diff + } + + /// calculate dedentation for current line compared to previous + pub fn get_back(line: &str) -> i8 { + if RE_DOCUMENT_END.is_match(line) { + return 0; + }; + + if RE_ENV_END.is_match(line) { + return 1; + }; + + let mut back: i8 = 0; + let mut cumul: i8 = 0; + for c in line.chars() { + cumul -= OPENS.contains(&c) as i8; + cumul += CLOSES.contains(&c) as i8; + back = max(cumul, back); + } + back + } + + pub fn get_indent(line: &str, prev_indent: Indent) -> Indent { + let diff = get_diff(line); + let back = get_back(line); + let actual = prev_indent.actual + diff; + let visual: i8 = prev_indent.actual - back; + Indent{actual, visual} + } +} diff --git a/src/main.rs b/src/main.rs index c70a7d7..4244737 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,11 @@ use clap::Parser; -use core::cmp::max; -use lazy_static::lazy_static; -use regex::Regex; +//use lazy_static::lazy_static; +//use regex::Regex; use std::env::temp_dir; use std::fs; use std::path; -const TAB: i32 = 2; -const OPENS: [char; 3] = ['(', '[', '{']; -const CLOSES: [char; 3] = [')', ']', '}']; -const LISTS: [&str; 4] = ["itemize", "enumerate", "description", "inlineroman"]; +const TAB: i8 = 2; const YELLOW: &str = "\x1b[33m\x1b[1m"; const PINK: &str = "\x1b[35m\x1b[1m"; @@ -29,30 +25,8 @@ struct Cli { filenames: Vec, } -lazy_static! { - static ref RE_NEWLINES: Regex = Regex::new(r"\n\n\n+").unwrap(); - static ref RE_TABS: Regex = Regex::new(r"\t").unwrap(); - static ref RE_TRAIL: Regex = Regex::new(r" +\n").unwrap(); - static ref RE_PERCENT: Regex = Regex::new(r"\\\%").unwrap(); - static ref RE_COMMENT: Regex = Regex::new(r"\%.*").unwrap(); - static ref RE_ITEM: Regex = Regex::new(r".*\\item.*").unwrap(); - static ref RE_DOCUMENT_BEGIN: Regex = - Regex::new(r".*\\begin\{document\}.*").unwrap(); - static ref RE_DOCUMENT_END: Regex = - Regex::new(r".*\\end\{document\}.*").unwrap(); - static ref RE_ENV_BEGIN: Regex = - Regex::new(r".*\\begin\{[a-z\*]*\}.*").unwrap(); - static ref RE_ENV_END: Regex = - Regex::new(r".*\\end\{[a-z\*]*\}.*").unwrap(); - static ref RE_LISTS_BEGIN: Vec = LISTS - .iter() - .map(|l| Regex::new(&format!(r".*\\begin\{{{}}}.*", l)).unwrap()) - .collect(); - static ref RE_LISTS_END: Vec = LISTS - .iter() - .map(|l| Regex::new(&format!(r".*\\end\{{{}}}.*", l)).unwrap()) - .collect(); -} +pub mod regexes; +use crate::regexes::regexes::*; fn remove_extra_newlines(file: &str) -> String { RE_NEWLINES.replace_all(file, "\n\n").to_string() @@ -72,79 +46,8 @@ fn remove_comment(line: &str) -> String { RE_COMMENT.replace_all(&new_line, "").to_string() } -fn get_back(line: &str) -> i32 { - // no deindentation for ending document - if RE_DOCUMENT_END.is_match(line) { - return 0; - }; - - // list environments get double indents for indenting items - for re_list_end in RE_LISTS_END.iter() { - if re_list_end.is_match(line) { - return 2; - }; - } - - // other environments get single indents - if RE_ENV_END.is_match(line) { - return 1; - }; - - // deindent items to make the rest of item environment appear indented - if RE_ITEM.is_match(line) { - return 1; - }; - - let mut back: i32 = 0; - let mut cumul: i32 = 0; - for c in line.chars() { - cumul -= OPENS.contains(&c) as i32; - cumul += CLOSES.contains(&c) as i32; - back = max(cumul, back); - } - back 
-} - -fn get_diff(line: &str) -> i32 { - // no indentation for document - if RE_DOCUMENT_BEGIN.is_match(line) { - return 0; - }; - if RE_DOCUMENT_END.is_match(line) { - return 0; - }; - - // list environments get double indents - let mut diff: i32 = 0; - for re_list_begin in RE_LISTS_BEGIN.iter() { - if re_list_begin.is_match(line) { - diff += 1 - }; - } - - for re_list_end in RE_LISTS_END.iter() { - if re_list_end.is_match(line) { - diff -= 1 - }; - } - - // other environments get single indents - if RE_ENV_BEGIN.is_match(line) { - diff += 1 - }; - if RE_ENV_END.is_match(line) { - diff -= 1 - }; - - // delimiters - for c in OPENS { - diff += line.chars().filter(|&x| x == c).count() as i32; - } - for c in CLOSES { - diff -= line.chars().filter(|&x| x == c).count() as i32; - } - diff -} +mod indent; +use crate::indent::indent::*; fn format_file(file: String, debug: bool) -> String { // preformat @@ -154,9 +57,9 @@ fn format_file(file: String, debug: bool) -> String { let lines: Vec<&str> = new_file.lines().collect(); // set up variables - let mut count: i32 = 0; + //let mut count: i8 = 0; let n_lines = lines.len(); - let mut indents: Vec = vec![0; lines.len()]; + let mut indent = Indent{actual: 0, visual: 0, item: 0}; let mut new_lines = vec!["".to_owned(); n_lines]; // main loop through file @@ -164,19 +67,25 @@ fn format_file(file: String, debug: bool) -> String { // calculate indent let line = lines[i]; let line_strip = &remove_comment(line); - let back = get_back(line_strip); - let diff = get_diff(line_strip); - let indent: i32 = count - back; + indent = get_indent(line_strip, indent); + //let back = get_back(line_strip); + //let diff = get_diff(line_strip); + //let indent: i8 = count - back; if !debug { - assert!(indent >= 0) + dbg!(&line); + dbg!(&indent.actual); + dbg!(&indent.visual); + dbg!(); + //assert!(indent.actual >= 0, "line {}", i); + //assert!(indent.visual >= 0, "line {}", i); }; - indents[i] = indent; - count += diff; + //indents[i] = indent; + //count += diff; // apply indent let mut new_line = line.trim_start().to_string(); if !new_line.is_empty() { - let n_spaces = indents[i] * TAB; + let n_spaces = indent.visual * TAB; let spaces: String = (0..n_spaces).map(|_| " ").collect(); new_line.insert_str(0, &spaces); } @@ -184,10 +93,9 @@ fn format_file(file: String, debug: bool) -> String { } // check indents return to zero - if !debug { - assert!(indents.first().unwrap() == &0); - assert!(indents.last().unwrap() == &0); - } + //if !debug { + //assert!(indent.current == &0); + //} // prepare indented file let mut new_file = new_lines.join("\n"); @@ -211,6 +119,7 @@ fn main() { // check files are in correct format assert!(filenames.iter().all(|f| f.ends_with(".tex") || f.ends_with(".bib") + || f.ends_with(".sty") || f.ends_with(".cls"))); // print script name diff --git a/src/regexes.rs b/src/regexes.rs new file mode 100644 index 0000000..716f5d0 --- /dev/null +++ b/src/regexes.rs @@ -0,0 +1,33 @@ +pub mod regexes { + + use lazy_static::lazy_static; + use regex::Regex; + + const LISTS: [&str; 4] = ["itemize", "enumerate", "description", "inlineroman"]; + + lazy_static! 
{ + pub static ref RE_NEWLINES: Regex = Regex::new(r"\n\n\n+").unwrap(); + pub static ref RE_TABS: Regex = Regex::new(r"\t").unwrap(); + pub static ref RE_TRAIL: Regex = Regex::new(r" +\n").unwrap(); + pub static ref RE_PERCENT: Regex = Regex::new(r"\\\%").unwrap(); + pub static ref RE_COMMENT: Regex = Regex::new(r"\%.*").unwrap(); + pub static ref RE_ITEM: Regex = Regex::new(r".*\\item.*").unwrap(); + pub static ref RE_DOCUMENT_BEGIN: Regex = + Regex::new(r".*\\begin\{document\}.*").unwrap(); + pub static ref RE_DOCUMENT_END: Regex = + Regex::new(r".*\\end\{document\}.*").unwrap(); + pub static ref RE_ENV_BEGIN: Regex = + Regex::new(r".*\\begin\{[a-z\*]*\}.*").unwrap(); + pub static ref RE_ENV_END: Regex = + Regex::new(r".*\\end\{[a-z\*]*\}.*").unwrap(); + static ref RE_LISTS_BEGIN: Vec = LISTS + .iter() + .map(|l| Regex::new(&format!(r".*\\begin\{{{}}}.*", l)).unwrap()) + .collect(); + static ref RE_LISTS_END: Vec = LISTS + .iter() + .map(|l| Regex::new(&format!(r".*\\end\{{{}}}.*", l)).unwrap()) + .collect(); + } + +} diff --git a/src/tests.rs b/src/tests.rs index 8e0eb84..f170fe5 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -37,14 +37,15 @@ mod tests { #[test] fn test_files() { - let filenames: Vec = fs::read_dir("tests/") - .unwrap() - .map(|f| f.unwrap().file_name().into_string().unwrap()) - .filter(|f| f.ends_with("_in.tex")) - .map(|f| f.strip_suffix("_in.tex").unwrap().to_string()) - .collect(); - for filename in filenames { - test_file(&filename); - } + test_file("readme"); + //let filenames: Vec = fs::read_dir("tests/") + //.unwrap() + //.map(|f| f.unwrap().file_name().into_string().unwrap()) + //.filter(|f| f.ends_with("_in.tex")) + //.map(|f| f.strip_suffix("_in.tex").unwrap().to_string()) + //.collect(); + //for filename in filenames { + //test_file(&filename); + //} } } diff --git a/tests/short_document_in.tex b/tests/short_document_in.tex index d6019bd..8f99566 100644 --- a/tests/short_document_in.tex +++ b/tests/short_document_in.tex @@ -5,7 +5,7 @@ \newtheorem{theorem}{Theorem} -\title{Testing \texttt{texfmt}} +\title{Testing \texttt{tex-fmt}} \author{William G.\ Underwood} \begin{document} \maketitle @@ -25,13 +25,13 @@ \item Item one % trailing comment with ]) brackets \item Item two on multiple lines -\item \item Item three \begin{itemize} \item Subitem one of item two \item Subitem two of item two \end{itemize} \item Item four % trailing comment with [( brackets +\item \end{itemize} \begin{theorem}[Pythagoras]% diff --git a/tests/short_document_out.tex b/tests/short_document_out.tex index 1f45a5a..8061ec6 100644 --- a/tests/short_document_out.tex +++ b/tests/short_document_out.tex @@ -5,7 +5,7 @@ \newtheorem{theorem}{Theorem} -\title{Testing \texttt{texfmt}} +\title{Testing \texttt{tex-fmt}} \author{William G.\ Underwood} \begin{document} \maketitle @@ -25,13 +25,13 @@ \item Item one % trailing comment with ]) brackets \item Item two on multiple lines - \item \item Item three \begin{itemize} \item Subitem one of item two \item Subitem two of item two \end{itemize} \item Item four % trailing comment with [( brackets + \item \end{itemize} \begin{theorem}[Pythagoras]% From 7ca17244eeab838e3855b454386ffdcc5d89f8bc Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 18:04:51 -0400 Subject: [PATCH 08/13] Breaking down main source code into modules --- .gitignore | 2 + shell.nix | 1 + src/indent.rs | 134 +++++++++++++++++------------ src/main.rs | 57 ++++-------- src/regexes.rs | 56 ++++++------ 
src/subs.rs | 20 +++++ src/tests.rs | 19 ++-- tests/lists_in.tex | 15 ++++ tests/lists_out.tex | 15 ++++ tests/long_lines_in.tex | 34 ++++++++ tests/long_lines_out.tex | 34 ++++++++ tests/masters_dissertation_in.tex | 1 - tests/masters_dissertation_out.tex | 1 - tests/phd_dissertation_in.tex | 1 - tests/phd_dissertation_out.tex | 1 - tests/verbatim_in.tex | 10 +++ tests/verbatim_out.tex | 10 +++ 17 files changed, 269 insertions(+), 142 deletions(-) create mode 100644 src/subs.rs create mode 100644 tests/lists_in.tex create mode 100644 tests/lists_out.tex create mode 100644 tests/long_lines_in.tex create mode 100644 tests/long_lines_out.tex create mode 100644 tests/verbatim_in.tex create mode 100644 tests/verbatim_out.tex diff --git a/.gitignore b/.gitignore index e6b8a01..d16fbe7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ target/ result *.html *.log +*.svg +perf.data* diff --git a/shell.nix b/shell.nix index 5ea2a22..5765570 100644 --- a/shell.nix +++ b/shell.nix @@ -4,5 +4,6 @@ pkgs.mkShell { buildInputs = with pkgs; [ rustfmt clippy + cargo-flamegraph ]; } diff --git a/src/indent.rs b/src/indent.rs index 99cdf77..0fea4b3 100644 --- a/src/indent.rs +++ b/src/indent.rs @@ -1,73 +1,95 @@ -pub mod indent { +use core::cmp::max; +use crate::regexes::*; - use core::cmp::max; - use crate::regexes::regexes::*; +const OPENS: [char; 3] = ['(', '[', '{']; +const CLOSES: [char; 3] = [')', ']', '}']; - const OPENS: [char; 3] = ['(', '[', '{']; - const CLOSES: [char; 3] = [')', ']', '}']; - - #[derive(Debug)] - pub struct Indent { - /// actual running indentation count at end of current line - pub actual: i8, - /// visual indentation of current line - pub visual: i8, - } +#[derive(Debug)] +pub struct Indent { + /// actual running indentation count at end of current line + pub actual: i8, + /// visual indentation of current line + pub visual: i8, +} - /// calculate total indentation change due to current line - pub fn get_diff(line: &str) -> i8 { - if RE_DOCUMENT_BEGIN.is_match(line) || RE_DOCUMENT_END.is_match(line) { - return 0; - }; +/// calculate total indentation change due to current line +pub fn get_diff(line: &str) -> i8 { + // documents get no global indentation + if RE_DOCUMENT_BEGIN.is_match(line) || RE_DOCUMENT_END.is_match(line) { + return 0; + }; - let mut diff: i8 = 0; - if RE_ENV_BEGIN.is_match(line) { + // list environments get double indents + let mut diff: i8 = 0; + for re_list_begin in RE_LISTS_BEGIN.iter() { + if re_list_begin.is_match(line) { diff += 1 }; - if RE_ENV_END.is_match(line) { - diff -= 1 - }; - if RE_ENV_BEGIN.is_match(line) { - diff += 1 - }; - if RE_ENV_END.is_match(line) { + } + for re_list_end in RE_LISTS_END.iter() { + if re_list_end.is_match(line) { diff -= 1 }; + } + + // other environments get single indents + if RE_ENV_BEGIN.is_match(line) { + diff += 1 + }; + if RE_ENV_END.is_match(line) { + diff -= 1 + }; - for c in OPENS { - diff += line.chars().filter(|&x| x == c).count() as i8; - } - for c in CLOSES { - diff -= line.chars().filter(|&x| x == c).count() as i8; - } - diff + // indent for delimiters + for c in OPENS { + diff += line.chars().filter(|&x| x == c).count() as i8; + } + for c in CLOSES { + diff -= line.chars().filter(|&x| x == c).count() as i8; } - /// calculate dedentation for current line compared to previous - pub fn get_back(line: &str) -> i8 { - if RE_DOCUMENT_END.is_match(line) { - return 0; - }; + diff +} - if RE_ENV_END.is_match(line) { - return 1; - }; +/// calculate dedentation for current line compared to previous +pub fn 
get_back(line: &str) -> i8 { + // documents get no global indentation + if RE_DOCUMENT_END.is_match(line) { + return 0; + }; - let mut back: i8 = 0; - let mut cumul: i8 = 0; - for c in line.chars() { - cumul -= OPENS.contains(&c) as i8; - cumul += CLOSES.contains(&c) as i8; - back = max(cumul, back); - } - back + // list environments get double indents for indenting items + for re_list_end in RE_LISTS_END.iter() { + if re_list_end.is_match(line) { + return 2; + }; } - pub fn get_indent(line: &str, prev_indent: Indent) -> Indent { - let diff = get_diff(line); - let back = get_back(line); - let actual = prev_indent.actual + diff; - let visual: i8 = prev_indent.actual - back; - Indent{actual, visual} + // other environments get single indents + if RE_ENV_END.is_match(line) { + return 1; + }; + + // deindent items to make the rest of item environment appear indented + if RE_ITEM.is_match(line) { + return 1; + }; + + + let mut back: i8 = 0; + let mut cumul: i8 = 0; + for c in line.chars() { + cumul -= OPENS.contains(&c) as i8; + cumul += CLOSES.contains(&c) as i8; + back = max(cumul, back); } + back +} + +pub fn get_indent(line: &str, prev_indent: Indent) -> Indent { + let diff = get_diff(line); + let back = get_back(line); + let actual = prev_indent.actual + diff; + let visual: i8 = prev_indent.actual - back; + Indent{actual, visual} } diff --git a/src/main.rs b/src/main.rs index 4244737..576bfc5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,4 @@ use clap::Parser; -//use lazy_static::lazy_static; -//use regex::Regex; use std::env::temp_dir; use std::fs; use std::path; @@ -26,28 +24,12 @@ struct Cli { } pub mod regexes; -use crate::regexes::regexes::*; -fn remove_extra_newlines(file: &str) -> String { - RE_NEWLINES.replace_all(file, "\n\n").to_string() -} - -fn remove_tabs(file: &str) -> String { - let replace = (0..TAB).map(|_| " ").collect::(); - RE_TABS.replace_all(file, replace).to_string() -} - -fn remove_trailing_spaces(file: &str) -> String { - RE_TRAIL.replace_all(file, "\n").to_string() -} - -fn remove_comment(line: &str) -> String { - let new_line = RE_PERCENT.replace_all(line, "").to_string(); - RE_COMMENT.replace_all(&new_line, "").to_string() -} +mod subs; +use crate::subs::*; mod indent; -use crate::indent::indent::*; +use crate::indent::*; fn format_file(file: String, debug: bool) -> String { // preformat @@ -57,9 +39,8 @@ fn format_file(file: String, debug: bool) -> String { let lines: Vec<&str> = new_file.lines().collect(); // set up variables - //let mut count: i8 = 0; let n_lines = lines.len(); - let mut indent = Indent{actual: 0, visual: 0, item: 0}; + let mut indent = Indent{actual: 0, visual: 0}; let mut new_lines = vec!["".to_owned(); n_lines]; // main loop through file @@ -68,19 +49,10 @@ fn format_file(file: String, debug: bool) -> String { let line = lines[i]; let line_strip = &remove_comment(line); indent = get_indent(line_strip, indent); - //let back = get_back(line_strip); - //let diff = get_diff(line_strip); - //let indent: i8 = count - back; if !debug { - dbg!(&line); - dbg!(&indent.actual); - dbg!(&indent.visual); - dbg!(); - //assert!(indent.actual >= 0, "line {}", i); - //assert!(indent.visual >= 0, "line {}", i); + assert!(indent.actual >= 0, "line {}", i); + assert!(indent.visual >= 0, "line {}", i); }; - //indents[i] = indent; - //count += diff; // apply indent let mut new_line = line.trim_start().to_string(); @@ -93,9 +65,10 @@ fn format_file(file: String, debug: bool) -> String { } // check indents return to zero - //if !debug { - 
//assert!(indent.current == &0); - //} + if !debug { + assert!(indent.actual == 0); + assert!(indent.visual == 0); + } // prepare indented file let mut new_file = new_lines.join("\n"); @@ -143,11 +116,11 @@ fn main() { } else { // backup original file let filepath = path::Path::new(&filename).canonicalize().unwrap(); - let mut fileback = temp_dir(); - fileback.push("tex-fmt"); - fs::create_dir_all(&fileback).unwrap(); - fileback.push(filepath.file_name().unwrap()); - fs::copy(filepath.clone(), &fileback).unwrap(); + let mut filebak = temp_dir(); + filebak.push("tex-fmt"); + fs::create_dir_all(&filebak).unwrap(); + filebak.push(filepath.file_name().unwrap()); + fs::copy(filepath.clone(), &filebak).unwrap(); // write new file fs::write(filepath, new_file).unwrap(); diff --git a/src/regexes.rs b/src/regexes.rs index 716f5d0..63a2009 100644 --- a/src/regexes.rs +++ b/src/regexes.rs @@ -1,33 +1,29 @@ -pub mod regexes { +use lazy_static::lazy_static; +use regex::Regex; - use lazy_static::lazy_static; - use regex::Regex; - - const LISTS: [&str; 4] = ["itemize", "enumerate", "description", "inlineroman"]; - - lazy_static! { - pub static ref RE_NEWLINES: Regex = Regex::new(r"\n\n\n+").unwrap(); - pub static ref RE_TABS: Regex = Regex::new(r"\t").unwrap(); - pub static ref RE_TRAIL: Regex = Regex::new(r" +\n").unwrap(); - pub static ref RE_PERCENT: Regex = Regex::new(r"\\\%").unwrap(); - pub static ref RE_COMMENT: Regex = Regex::new(r"\%.*").unwrap(); - pub static ref RE_ITEM: Regex = Regex::new(r".*\\item.*").unwrap(); - pub static ref RE_DOCUMENT_BEGIN: Regex = - Regex::new(r".*\\begin\{document\}.*").unwrap(); - pub static ref RE_DOCUMENT_END: Regex = - Regex::new(r".*\\end\{document\}.*").unwrap(); - pub static ref RE_ENV_BEGIN: Regex = - Regex::new(r".*\\begin\{[a-z\*]*\}.*").unwrap(); - pub static ref RE_ENV_END: Regex = - Regex::new(r".*\\end\{[a-z\*]*\}.*").unwrap(); - static ref RE_LISTS_BEGIN: Vec = LISTS - .iter() - .map(|l| Regex::new(&format!(r".*\\begin\{{{}}}.*", l)).unwrap()) - .collect(); - static ref RE_LISTS_END: Vec = LISTS - .iter() - .map(|l| Regex::new(&format!(r".*\\end\{{{}}}.*", l)).unwrap()) - .collect(); - } +const LISTS: [&str; 4] = ["itemize", "enumerate", "description", "inlineroman"]; +lazy_static! 
{ + pub static ref RE_NEWLINES: Regex = Regex::new(r"\n\n\n+").unwrap(); + pub static ref RE_TABS: Regex = Regex::new(r"\t").unwrap(); + pub static ref RE_TRAIL: Regex = Regex::new(r" +\n").unwrap(); + pub static ref RE_PERCENT: Regex = Regex::new(r"\\\%").unwrap(); + pub static ref RE_COMMENT: Regex = Regex::new(r"\%.*").unwrap(); + pub static ref RE_ITEM: Regex = Regex::new(r".*\\item.*").unwrap(); + pub static ref RE_DOCUMENT_BEGIN: Regex = + Regex::new(r".*\\begin\{document\}.*").unwrap(); + pub static ref RE_DOCUMENT_END: Regex = + Regex::new(r".*\\end\{document\}.*").unwrap(); + pub static ref RE_ENV_BEGIN: Regex = + Regex::new(r".*\\begin\{[a-z\*]*\}.*").unwrap(); + pub static ref RE_ENV_END: Regex = + Regex::new(r".*\\end\{[a-z\*]*\}.*").unwrap(); + pub static ref RE_LISTS_BEGIN: Vec = LISTS + .iter() + .map(|l| Regex::new(&format!(r".*\\begin\{{{}}}.*", l)).unwrap()) + .collect(); + pub static ref RE_LISTS_END: Vec = LISTS + .iter() + .map(|l| Regex::new(&format!(r".*\\end\{{{}}}.*", l)).unwrap()) + .collect(); } diff --git a/src/subs.rs b/src/subs.rs new file mode 100644 index 0000000..fa77b09 --- /dev/null +++ b/src/subs.rs @@ -0,0 +1,20 @@ +use crate::TAB; +use crate::regexes::*; + +pub fn remove_extra_newlines(file: &str) -> String { + RE_NEWLINES.replace_all(file, "\n\n").to_string() +} + +pub fn remove_tabs(file: &str) -> String { + let replace = (0..TAB).map(|_| " ").collect::(); + RE_TABS.replace_all(file, replace).to_string() +} + +pub fn remove_trailing_spaces(file: &str) -> String { + RE_TRAIL.replace_all(file, "\n").to_string() +} + +pub fn remove_comment(line: &str) -> String { + let new_line = RE_PERCENT.replace_all(line, "").to_string(); + RE_COMMENT.replace_all(&new_line, "").to_string() +} diff --git a/src/tests.rs b/src/tests.rs index f170fe5..8e0eb84 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -37,15 +37,14 @@ mod tests { #[test] fn test_files() { - test_file("readme"); - //let filenames: Vec = fs::read_dir("tests/") - //.unwrap() - //.map(|f| f.unwrap().file_name().into_string().unwrap()) - //.filter(|f| f.ends_with("_in.tex")) - //.map(|f| f.strip_suffix("_in.tex").unwrap().to_string()) - //.collect(); - //for filename in filenames { - //test_file(&filename); - //} + let filenames: Vec = fs::read_dir("tests/") + .unwrap() + .map(|f| f.unwrap().file_name().into_string().unwrap()) + .filter(|f| f.ends_with("_in.tex")) + .map(|f| f.strip_suffix("_in.tex").unwrap().to_string()) + .collect(); + for filename in filenames { + test_file(&filename); + } } } diff --git a/tests/lists_in.tex b/tests/lists_in.tex new file mode 100644 index 0000000..488c724 --- /dev/null +++ b/tests/lists_in.tex @@ -0,0 +1,15 @@ +\begin{itemize} + +\item Lists with items on one line + +\item Lists with items +on multiple lines + +% TODO comments before a list item +\item Another item + +\item Another item +% comments inside a list item +Or even just % trailing comments + +\end{itemize} diff --git a/tests/lists_out.tex b/tests/lists_out.tex new file mode 100644 index 0000000..d7b51a8 --- /dev/null +++ b/tests/lists_out.tex @@ -0,0 +1,15 @@ +\begin{itemize} + + \item Lists with items on one line + + \item Lists with items + on multiple lines + + % TODO comments before a list item + \item Another item + + \item Another item + % comments inside a list item + Or even just % trailing comments + +\end{itemize} diff --git a/tests/long_lines_in.tex b/tests/long_lines_in.tex new file mode 100644 index 0000000..146a050 --- /dev/null +++ b/tests/long_lines_in.tex @@ -0,0 +1,34 @@ +TODO + +% regular 
long line +This line is too long because it has more than eighty characters inside it. Therefore it should be split. + +% indented long line +( +This line is too long because it has more than eighty characters inside it. Therefore it should be split. +) + +% long line in environment +\begin{equation} +1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 +\end{equation} + +% long line in environment only after indenting +\begin{equation} +1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 123 +\end{equation} + +% long line with no breaks should throw a warning +\begin{equation} +123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 +\end{equation} + +This line is too long because it has more than eighty characters in it % but only including the comment + +This line is too long because it has more than eighty characters inside it % but only just including the comment + +This line is too long because it has more than eighty characters inside it. It should be split. % also a comment + +This line has a long comment. % This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words + +% This line is all a long comment. This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words diff --git a/tests/long_lines_out.tex b/tests/long_lines_out.tex new file mode 100644 index 0000000..2457a1b --- /dev/null +++ b/tests/long_lines_out.tex @@ -0,0 +1,34 @@ +TODO + +% regular long line +This line is too long because it has more than eighty characters inside it. Therefore it should be split. + +% indented long line +( + This line is too long because it has more than eighty characters inside it. Therefore it should be split. +) + +% long line in environment +\begin{equation} + 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 +\end{equation} + +% long line in environment only after indenting +\begin{equation} + 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 123 +\end{equation} + +% long line with no breaks should throw a warning +\begin{equation} + 123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 +\end{equation} + +This line is too long because it has more than eighty characters in it % but only including the comment + +This line is too long because it has more than eighty characters inside it % but only just including the comment + +This line is too long because it has more than eighty characters inside it. It should be split. % also a comment + +This line has a long comment. % This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words + +% This line is all a long comment. 
This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words diff --git a/tests/masters_dissertation_in.tex b/tests/masters_dissertation_in.tex index d123252..2307078 100644 --- a/tests/masters_dissertation_in.tex +++ b/tests/masters_dissertation_in.tex @@ -54,7 +54,6 @@ \usepackage{showframe} \usepackage{layout} }{} -%\usepackage[obeyDraft]{todonotes} % hyperlinks \usepackage[plainpages=false,draft=false diff --git a/tests/masters_dissertation_out.tex b/tests/masters_dissertation_out.tex index 277255e..cc6a09e 100644 --- a/tests/masters_dissertation_out.tex +++ b/tests/masters_dissertation_out.tex @@ -54,7 +54,6 @@ \usepackage{showframe} \usepackage{layout} }{} -%\usepackage[obeyDraft]{todonotes} % hyperlinks \usepackage[plainpages=false,draft=false diff --git a/tests/phd_dissertation_in.tex b/tests/phd_dissertation_in.tex index 8919eec..45b05d1 100644 --- a/tests/phd_dissertation_in.tex +++ b/tests/phd_dissertation_in.tex @@ -28,7 +28,6 @@ % settings \pgfplotsset{compat=1.9} -\newcommand{\TODO}[1]{\textcolor{red}{\textsc{TODO}: #1}} \setcitestyle{round} \captionsetup[subfigure]{justification=centering} \def\arraystretch{1.3} diff --git a/tests/phd_dissertation_out.tex b/tests/phd_dissertation_out.tex index 9fd44a8..4fead58 100644 --- a/tests/phd_dissertation_out.tex +++ b/tests/phd_dissertation_out.tex @@ -28,7 +28,6 @@ % settings \pgfplotsset{compat=1.9} -\newcommand{\TODO}[1]{\textcolor{red}{\textsc{TODO}: #1}} \setcitestyle{round} \captionsetup[subfigure]{justification=centering} \def\arraystretch{1.3} diff --git a/tests/verbatim_in.tex b/tests/verbatim_in.tex new file mode 100644 index 0000000..282162d --- /dev/null +++ b/tests/verbatim_in.tex @@ -0,0 +1,10 @@ +\documentclass{article} +\begin{document} + +\begin{verbatim} + +TODO + +\end{verbatim} + +\end{document} diff --git a/tests/verbatim_out.tex b/tests/verbatim_out.tex new file mode 100644 index 0000000..a4a041f --- /dev/null +++ b/tests/verbatim_out.tex @@ -0,0 +1,10 @@ +\documentclass{article} +\begin{document} + +\begin{verbatim} + + TODO + +\end{verbatim} + +\end{document} From b7134aff3765f0a30f1f0520466e4cc8270679f7 Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 18:43:24 -0400 Subject: [PATCH 09/13] Added feature to start and end environments on new lines --- notes.org | 10 +--------- src/indent.rs | 5 ++--- src/main.rs | 8 +++++++- src/regexes.rs | 8 ++++++-- src/subs.rs | 14 +++++++++++++- src/tests.rs | 3 +++ tests/environment_lines_in.tex | 16 ++++++++++++++++ tests/environment_lines_out.tex | 23 +++++++++++++++++++++++ 8 files changed, 71 insertions(+), 16 deletions(-) create mode 100644 tests/environment_lines_in.tex create mode 100644 tests/environment_lines_out.tex diff --git a/notes.org b/notes.org index 30e01ba..deb3761 100644 --- a/notes.org +++ b/notes.org @@ -1,11 +1,7 @@ #+title: tex-fmt * Tests -** One feature per file ** Look for problem cases in other latex documents * Features -** Begin and end document on own line -** Environments should start and end on new lines -*** New line before begin/end with regex replace ** Fold long lines to 80 characters *** Care with trailing comments *** No folding in verbatim environments, just warn @@ -16,16 +12,12 @@ *** -v Info verbose *** -vv Debug verbose ** Log file -** Non-negativity and end-of-file checks apply per indenter type -*** Care with half-open intervals like [0,1) -*** Just use ignore comments for 
these cases -** Ignore some source lines +** Ignore source lines *** Line-by-line ignore *** Block ignore * Bugs ** Check multiple begins or ends on the same line *** This should never happen once new lines are implemented -*** Still should implement counting for cumulative and back ** Better errors including line numbers in source file ** Check begins or ends with more brackets on the same line * Structure diff --git a/src/indent.rs b/src/indent.rs index 0fea4b3..c3ee869 100644 --- a/src/indent.rs +++ b/src/indent.rs @@ -1,5 +1,5 @@ -use core::cmp::max; use crate::regexes::*; +use core::cmp::max; const OPENS: [char; 3] = ['(', '[', '{']; const CLOSES: [char; 3] = [')', ']', '}']; @@ -75,7 +75,6 @@ pub fn get_back(line: &str) -> i8 { return 1; }; - let mut back: i8 = 0; let mut cumul: i8 = 0; for c in line.chars() { @@ -91,5 +90,5 @@ pub fn get_indent(line: &str, prev_indent: Indent) -> Indent { let back = get_back(line); let actual = prev_indent.actual + diff; let visual: i8 = prev_indent.actual - back; - Indent{actual, visual} + Indent { actual, visual } } diff --git a/src/main.rs b/src/main.rs index 576bfc5..bf718c7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,13 +34,18 @@ use crate::indent::*; fn format_file(file: String, debug: bool) -> String { // preformat let mut new_file = remove_extra_newlines(&file); + new_file = begin_environments_new_line(&new_file); + new_file = end_environments_new_line(&new_file); new_file = remove_tabs(&new_file); new_file = remove_trailing_spaces(&new_file); let lines: Vec<&str> = new_file.lines().collect(); // set up variables let n_lines = lines.len(); - let mut indent = Indent{actual: 0, visual: 0}; + let mut indent = Indent { + actual: 0, + visual: 0, + }; let mut new_lines = vec!["".to_owned(); n_lines]; // main loop through file @@ -50,6 +55,7 @@ fn format_file(file: String, debug: bool) -> String { let line_strip = &remove_comment(line); indent = get_indent(line_strip, indent); if !debug { + dbg!(&line); assert!(indent.actual >= 0, "line {}", i); assert!(indent.visual >= 0, "line {}", i); }; diff --git a/src/regexes.rs b/src/regexes.rs index 63a2009..5621f8b 100644 --- a/src/regexes.rs +++ b/src/regexes.rs @@ -15,9 +15,9 @@ lazy_static! { pub static ref RE_DOCUMENT_END: Regex = Regex::new(r".*\\end\{document\}.*").unwrap(); pub static ref RE_ENV_BEGIN: Regex = - Regex::new(r".*\\begin\{[a-z\*]*\}.*").unwrap(); + Regex::new(r".*\\begin\{[a-zA-Z0-9\*]*\}.*").unwrap(); pub static ref RE_ENV_END: Regex = - Regex::new(r".*\\end\{[a-z\*]*\}.*").unwrap(); + Regex::new(r".*\\end\{[a-zA-Z0-9\*]*\}.*").unwrap(); pub static ref RE_LISTS_BEGIN: Vec = LISTS .iter() .map(|l| Regex::new(&format!(r".*\\begin\{{{}}}.*", l)).unwrap()) @@ -26,4 +26,8 @@ lazy_static! 
{
         .iter()
         .map(|l| Regex::new(&format!(r".*\\end\{{{}}}.*", l)).unwrap())
         .collect();
+    pub static ref RE_ENV_BEGIN_SHARED_LINE: Regex =
+        Regex::new(r"(?P<prev>\S.*?)(?P<env>\\begin)").unwrap();
+    pub static ref RE_ENV_END_SHARED_LINE: Regex =
+        Regex::new(r"(?P<prev>\S.*?)(?P<env>\\end)").unwrap();
 }
diff --git a/src/subs.rs b/src/subs.rs
index fa77b09..7f4bdfe 100644
--- a/src/subs.rs
+++ b/src/subs.rs
@@ -1,5 +1,5 @@
-use crate::TAB;
 use crate::regexes::*;
+use crate::TAB;
 
 pub fn remove_extra_newlines(file: &str) -> String {
     RE_NEWLINES.replace_all(file, "\n\n").to_string()
@@ -14,6 +14,18 @@ pub fn remove_trailing_spaces(file: &str) -> String {
     RE_TRAIL.replace_all(file, "\n").to_string()
 }
 
+pub fn begin_environments_new_line(file: &str) -> String {
+    RE_ENV_BEGIN_SHARED_LINE
+        .replace_all(file, "$prev\n$env")
+        .to_string()
+}
+
+pub fn end_environments_new_line(file: &str) -> String {
+    RE_ENV_END_SHARED_LINE
+        .replace_all(file, "$prev\n$env")
+        .to_string()
+}
+
 pub fn remove_comment(line: &str) -> String {
     let new_line = RE_PERCENT.replace_all(line, "").to_string();
     RE_COMMENT.replace_all(&new_line, "").to_string()
diff --git a/src/tests.rs b/src/tests.rs
index 8e0eb84..c307e41 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -37,6 +37,8 @@ mod tests {
 
     #[test]
     fn test_files() {
+        test_file("environment_lines");
+        /*
         let filenames: Vec<String> = fs::read_dir("tests/")
             .unwrap()
             .map(|f| f.unwrap().file_name().into_string().unwrap())
@@ -46,5 +48,6 @@ mod tests {
         for filename in filenames {
             test_file(&filename);
         }
+        */
     }
 }
diff --git a/tests/environment_lines_in.tex b/tests/environment_lines_in.tex
new file mode 100644
index 0000000..07f631e
--- /dev/null
+++ b/tests/environment_lines_in.tex
@@ -0,0 +1,16 @@
+% environments on separate lines
+\begin{env1}
+\begin{env2}
+\end{env2}
+\end{env1}
+
+% environments on shared lines
+\begin{env1}\begin{env2}
+\end{env2}\end{env1}
+
+% environments on shared lines with spaces
+\begin{env1} \begin{env2}
+\end{env2} \end{env1}
+
+% environments all on same line
+\begin{env1}\begin{env2}\end{env2}\end{env1}
diff --git a/tests/environment_lines_out.tex b/tests/environment_lines_out.tex
new file mode 100644
index 0000000..c300821
--- /dev/null
+++ b/tests/environment_lines_out.tex
@@ -0,0 +1,23 @@
+% environments on separate lines
+\begin{env1}
+    \begin{env2}
+    \end{env2}
+\end{env1}
+
+% environments on shared lines
+\begin{env1}
+    \begin{env2}
+    \end{env2}
+\end{env1}
+
+% environments on shared lines with spaces
+\begin{env1}
+    \begin{env2}
+    \end{env2}
+\end{env1}
+
+% environments all on same line
+\begin{env1}
+    \begin{env2}
+    \end{env2}
+\end{env1}

From 9e4388a049aa149c41c0e0db4f67ab48cf99cb9e Mon Sep 17 00:00:00 2001
From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com>
Date: Wed, 1 May 2024 18:46:00 -0400
Subject: [PATCH 10/13] Move format function into module

---
 src/format.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs   | 61 ++++-----------------------------------------------
 2 files changed, 58 insertions(+), 57 deletions(-)
 create mode 100644 src/format.rs

diff --git a/src/format.rs b/src/format.rs
new file mode 100644
index 0000000..cae81c6
--- /dev/null
+++ b/src/format.rs
@@ -0,0 +1,54 @@
+use crate::indent::*;
+use crate::subs::*;
+use crate::TAB;
+
+pub fn format_file(file: String, debug: bool) -> String {
+    // preformat
+    let mut new_file = remove_extra_newlines(&file);
+    new_file = begin_environments_new_line(&new_file);
+    new_file = end_environments_new_line(&new_file);
+    new_file = remove_tabs(&new_file);
+    
new_file = remove_trailing_spaces(&new_file); + let lines: Vec<&str> = new_file.lines().collect(); + + // set up variables + let n_lines = lines.len(); + let mut indent = Indent { + actual: 0, + visual: 0, + }; + let mut new_lines = vec!["".to_owned(); n_lines]; + + // main loop through file + for i in 0..n_lines { + // calculate indent + let line = lines[i]; + let line_strip = &remove_comment(line); + indent = get_indent(line_strip, indent); + if !debug { + dbg!(&line); + assert!(indent.actual >= 0, "line {}", i); + assert!(indent.visual >= 0, "line {}", i); + }; + + // apply indent + let mut new_line = line.trim_start().to_string(); + if !new_line.is_empty() { + let n_spaces = indent.visual * TAB; + let spaces: String = (0..n_spaces).map(|_| " ").collect(); + new_line.insert_str(0, &spaces); + } + new_lines[i] = new_line + } + + // check indents return to zero + if !debug { + assert!(indent.actual == 0); + assert!(indent.visual == 0); + } + + // prepare indented file + let mut new_file = new_lines.join("\n"); + new_file.push('\n'); + new_file +} diff --git a/src/main.rs b/src/main.rs index bf718c7..50f8078 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,64 +23,11 @@ struct Cli { filenames: Vec, } -pub mod regexes; - -mod subs; -use crate::subs::*; - +mod format; mod indent; -use crate::indent::*; - -fn format_file(file: String, debug: bool) -> String { - // preformat - let mut new_file = remove_extra_newlines(&file); - new_file = begin_environments_new_line(&new_file); - new_file = end_environments_new_line(&new_file); - new_file = remove_tabs(&new_file); - new_file = remove_trailing_spaces(&new_file); - let lines: Vec<&str> = new_file.lines().collect(); - - // set up variables - let n_lines = lines.len(); - let mut indent = Indent { - actual: 0, - visual: 0, - }; - let mut new_lines = vec!["".to_owned(); n_lines]; - - // main loop through file - for i in 0..n_lines { - // calculate indent - let line = lines[i]; - let line_strip = &remove_comment(line); - indent = get_indent(line_strip, indent); - if !debug { - dbg!(&line); - assert!(indent.actual >= 0, "line {}", i); - assert!(indent.visual >= 0, "line {}", i); - }; - - // apply indent - let mut new_line = line.trim_start().to_string(); - if !new_line.is_empty() { - let n_spaces = indent.visual * TAB; - let spaces: String = (0..n_spaces).map(|_| " ").collect(); - new_line.insert_str(0, &spaces); - } - new_lines[i] = new_line - } - - // check indents return to zero - if !debug { - assert!(indent.actual == 0); - assert!(indent.visual == 0); - } - - // prepare indented file - let mut new_file = new_lines.join("\n"); - new_file.push('\n'); - new_file -} +mod regexes; +mod subs; +use crate::format::*; #[cfg(test)] mod tests; From dbbe4f6961700ab5cbfccc61bda6112f5164c10d Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 18:47:53 -0400 Subject: [PATCH 11/13] Formatting README --- README.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3fecdb6..c937e3c 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,10 @@ A LaTeX formatter written in Rust. 
From ff14461851d7fa16bce126a1928d0873a7716a13 Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 20:23:21 -0400 Subject: [PATCH 12/13] Working on line breaking for environments --- notes.org | 3 +++ src/format.rs | 3 +-- src/subs.rs | 35 +++++++++++++++++++++++---------- src/tests.rs | 3 --- tests/environment_lines_in.tex | 20 ++++++++++--------- tests/environment_lines_out.tex | 27 +++++++++++-------------- 6 files changed, 51 insertions(+), 40 deletions(-) diff --git a/notes.org b/notes.org index deb3761..ec05a81 100644 --- a/notes.org +++ b/notes.org @@ -2,6 +2,9 @@ * Tests ** Look for problem cases in other latex documents * Features +** TODO Environments should start and end on new lines +*** New line before begin/end with regex replace +*** Care with comments ** Fold long lines to 80 characters *** Care with trailing comments *** No folding in verbatim environments, just warn diff --git a/src/format.rs b/src/format.rs index cae81c6..a4a3952 100644 --- a/src/format.rs +++ b/src/format.rs @@ -5,8 +5,7 @@ use crate::TAB; pub fn format_file(file: String, debug: bool) -> String { // preformat let mut new_file = remove_extra_newlines(&file); - new_file = begin_environments_new_line(&new_file); - new_file = end_environments_new_line(&new_file); + //new_file = begin_end_environments_new_line(&new_file); new_file = remove_tabs(&new_file); new_file = remove_trailing_spaces(&new_file); let lines: Vec<&str> = new_file.lines().collect(); diff --git a/src/subs.rs b/src/subs.rs index 7f4bdfe..a847460 100644 --- a/src/subs.rs +++ b/src/subs.rs @@ -14,17 +14,32 @@ pub fn remove_trailing_spaces(file: &str) -> String { RE_TRAIL.replace_all(file, "\n").to_string() } -pub fn begin_environments_new_line(file: &str) -> String { - RE_ENV_BEGIN_SHARED_LINE - .replace_all(file, "$prev\n$env") - .to_string() -} +// TODO +//pub fn begin_end_environments_new_line(file: &str) -> String { + //file + //.lines() + //.map(|l| remove_comment(l)) + //.map(|l| + //RE_ENV_BEGIN_SHARED_LINE + //.replace_all(&l, "$prev\n$env") + //.to_string()) + //.map(|l| + //RE_ENV_END_SHARED_LINE + //.replace_all(&l, "$prev\n$env") + //.to_string()) + //.fold(String::new(), |a, b| a + &b + "\n") -pub fn end_environments_new_line(file: &str) -> String { - RE_ENV_END_SHARED_LINE - .replace_all(file, "$prev\n$env") - .to_string() -} + //let lines: Vec<&str> = new_file.lines().collect(); + //let n_lines = lines.len(); + //let mut new_lines = vec![]; + //for i in 0..n_lines { + //let line = lines[i]; + //if RE_ENV_BEGIN_SHARED_LINE + //} + //RE_ENV_BEGIN_SHARED_LINE + //.replace_all(file, "$prev\n$env") + //.to_string() +//} pub fn remove_comment(line: &str) -> String { let new_line = RE_PERCENT.replace_all(line, "").to_string(); diff --git a/src/tests.rs b/src/tests.rs index c307e41..8e0eb84 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -37,8 +37,6 @@ mod tests { #[test] fn test_files() { - test_file("environment_lines"); - /* let filenames: Vec = fs::read_dir("tests/") .unwrap() .map(|f| f.unwrap().file_name().into_string().unwrap()) @@ -48,6 +46,5 @@ mod tests { for filename in filenames { test_file(&filename); } - */ } } diff --git a/tests/environment_lines_in.tex b/tests/environment_lines_in.tex index 07f631e..12f11d7 100644 --- a/tests/environment_lines_in.tex +++ b/tests/environment_lines_in.tex @@ -1,16 +1,18 @@ +TODO + % environments on separate lines -\begin{env1} -\begin{env2} -\end{env2} -\end{env1} +%\begin{env1} +%\begin{env2} +%\end{env2} +%\end{env1} % 
environments on shared lines -\begin{env1}\begin{env2} -\end{env2}\end{env1} +%\begin{env1}\begin{env2} +%\end{env2}\end{env1} % environments on shared lines with spaces -\begin{env1} \begin{env2} -\end{env2} \end{env1} +%\begin{env1} \begin{env2} +%\end{env2} \end{env1} % environments all on same line -\begin{env1}\begin{env2}\end{env2}\end{env1} +%\begin{env1}\begin{env2}\end{env2}\end{env1} diff --git a/tests/environment_lines_out.tex b/tests/environment_lines_out.tex index c300821..12f11d7 100644 --- a/tests/environment_lines_out.tex +++ b/tests/environment_lines_out.tex @@ -1,23 +1,18 @@ +TODO + % environments on separate lines -\begin{env1} - \begin{env2} - \end{env2} -\end{env1} +%\begin{env1} +%\begin{env2} +%\end{env2} +%\end{env1} % environments on shared lines -\begin{env1} - \begin{env2} - \end{env2} -\end{env1} +%\begin{env1}\begin{env2} +%\end{env2}\end{env1} % environments on shared lines with spaces -\begin{env1} - \begin{env2} - \end{env2} -\end{env1} +%\begin{env1} \begin{env2} +%\end{env2} \end{env1} % environments all on same line -\begin{env1} - \begin{env2} - \end{env2} -\end{env1} +%\begin{env1}\begin{env2}\end{env2}\end{env1} From 77a125768fccf1940942f2ce0ddd5da6f08e8195 Mon Sep 17 00:00:00 2001 From: William G Underwood <42812654+WGUNDERWOOD@users.noreply.github.com> Date: Wed, 1 May 2024 20:24:38 -0400 Subject: [PATCH 13/13] README table format --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c937e3c..6fde5f5 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,10 @@ A LaTeX formatter written in Rust.
- -``` shell -example.tex -``` +Input - -``` shell -tex-fmt example.tex -``` +Output
-Input +Input -Output +Output