[proofplan]
We decompose the estimation error into a deterministic bias term and a centred random fluctuation. The bias converges to zero because the rescaled kernels average $f$ over a shrinking neighbourhood of the continuity point $x$. The stochastic term converges to zero in probability by a variance estimate of order $(n h_n)^{-1}$, followed by [Chebyshev's inequality](/theorems/1126). Combining these two convergences gives pointwise convergence in probability.
[/proofplan]
[step:Fix the probability and measure notation]
Throughout the proof, $(\Omega,\mathcal F,\mathbb P)$ denotes the probability space on which the random variables are defined, and $\mathcal L^1$ denotes one-dimensional [Lebesgue measure](/page/Lebesgue%20Measure) on $(\mathbb R,\mathcal B(\mathbb R))$. We use $K$ to denote the bounded compactly supported Borel representative appearing in the estimator, and $\operatorname{supp}K$ denotes its ordinary closed support.
[/step]
[step:Compute the expectation and show the bias vanishes]
For each $n\in\mathbb N$ and $i\in\{1,\dots,n\}$, define the [random variable](/page/Random%20Variable) $Y_{n,i}:\Omega\to\mathbb R$ by
\begin{align*}
Y_{n,i}(\omega)=\frac{1}{h_n}K\left(\frac{x-X_i(\omega)}{h_n}\right).
\end{align*}
Then
\begin{align*}
\hat f_{n,h_n}(x)=\frac{1}{n}\sum_{i=1}^n Y_{n,i}.
\end{align*}
Since $K$ is bounded and compactly supported, $K\in L^1(\mathbb R)$, and the kernel normalization hypothesis gives
\begin{align*}
\int_{\mathbb R}K(u)\,d\mathcal L^1(u)=1.
\end{align*}
For fixed $n$ and $i$, the map $y\mapsto h_n^{-1}K((x-y)/h_n)$ is bounded and supported in the compact set $x-h_n\operatorname{supp}K$. Since $f\in L^1(\mathbb R)$, the product $y\mapsto h_n^{-1}K((x-y)/h_n)f(y)$ is integrable with respect to $\mathcal L^1$.
Because $X_i$ has density $f$ with respect to $\mathcal L^1$,
\begin{align*}
\mathbb E[Y_{n,i}]=\int_{\mathbb R}\frac{1}{h_n}K\left(\frac{x-y}{h_n}\right)f(y)\,d\mathcal L^1(y).
\end{align*}
Use the change of variables formula with $u=(x-y)/h_n$, equivalently $y=x-h_nu$, under which $d\mathcal L^1(y)=h_n\,d\mathcal L^1(u)$. The domain $\mathbb R$ is mapped onto $\mathbb R$, and the integrability verified above justifies the substitution. Since the $Y_{n,i}$ have the same expectation, $\mathbb E[\hat f_{n,h_n}(x)]=\mathbb E[Y_{n,1}]$, and hence
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]=\int_{\mathbb R}K(u)f(x-h_nu)\,d\mathcal L^1(u).
\end{align*}
Therefore, using $\int_{\mathbb R}K(u)\,d\mathcal L^1(u)=1$,
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]-f(x)
=\int_{\mathbb R}K(u)\bigl(f(x-h_nu)-f(x)\bigr)\,d\mathcal L^1(u).
\end{align*}
Let $R>0$ be such that $\operatorname{supp}K\subset[-R,R]$, and fix $\varepsilon>0$. Since $f$ is continuous at $x$, there exists $\delta>0$ such that $|f(z)-f(x)|<\varepsilon/(1+\|K\|_{L^1})$ whenever $|z-x|<\delta$. Since $h_n\to0$, we may choose $N\in\mathbb N$ such that $h_nR<\delta$ for all $n\ge N$. Then for $n\ge N$ and $u\in\operatorname{supp}K$,
\begin{align*}
|x-h_nu-x|=h_n|u|\le h_nR<\delta.
\end{align*}
Hence the triangle inequality for the [Lebesgue integral](/page/Lebesgue%20Integral) gives
\begin{align*}
\left|\mathbb E[\hat f_{n,h_n}(x)]-f(x)\right|\le \int_{\mathbb R}|K(u)|\,|f(x-h_nu)-f(x)|\,d\mathcal L^1(u).
\end{align*}
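For $n\ge N$, the continuity bound applies at every $u\in\operatorname{supp}K$, while the integrand vanishes for $u\notin\operatorname{supp}K$ because $K(u)=0$ there. This gives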
\begin{align*}
\int_{\mathbb R}|K(u)|\,|f(x-h_nu)-f(x)|\,d\mathcal L^1(u)\le \frac{\varepsilon}{1+\|K\|_{L^1}}\int_{\mathbb R}|K(u)|\,d\mathcal L^1(u).
\end{align*}
Since $\int_{\mathbb R}|K(u)|\,d\mathcal L^1(u)=\|K\|_{L^1}$, this upper bound is at most $\varepsilon$.
Thus
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]\to f(x).
\end{align*}
[guided]
The expectation is the deterministic part of the estimator, so we compute it first. For each $n\in\mathbb N$ and $i\in\{1,\dots,n\}$, define the random variable $Y_{n,i}:\Omega\to\mathbb R$ by
\begin{align*}
Y_{n,i}(\omega)=\frac{1}{h_n}K\left(\frac{x-X_i(\omega)}{h_n}\right).
\end{align*}
Then
\begin{align*}
\hat f_{n,h_n}(x)=\frac{1}{n}\sum_{i=1}^n Y_{n,i}.
\end{align*}
Since all $X_i$ have the same density $f$, all $Y_{n,i}$ have the same expectation. Before using the density formula, we check integrability. For fixed $n$ and $i$, the map $y\mapsto h_n^{-1}K((x-y)/h_n)$ is bounded and supported in the compact set $x-h_n\operatorname{supp}K$. Since $f\in L^1(\mathbb R)$, the product $y\mapsto h_n^{-1}K((x-y)/h_n)f(y)$ is integrable with respect to $\mathcal L^1$. The density assumption therefore gives
\begin{align*}
\mathbb E[Y_{n,i}]
=\int_{\mathbb R}\frac{1}{h_n}K\left(\frac{x-y}{h_n}\right)f(y)\,d\mathcal L^1(y).
\end{align*}
Now apply the change of variables formula with the substitution $u=(x-y)/h_n$, so $y=x-h_nu$. Because $h_n>0$, this affine change of variables maps $\mathbb R$ bijectively onto $\mathbb R$, and the one-dimensional Lebesgue measure transforms as $d\mathcal L^1(y)=h_n\,d\mathcal L^1(u)$. The integrability checked in the preceding paragraph justifies applying the formula. Therefore
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]=\mathbb E[Y_{n,1}]=\int_{\mathbb R}K(u)f(x-h_nu)\,d\mathcal L^1(u).
\end{align*}
Subtracting $f(x)$ is useful because the kernel has total mass one by hypothesis:
\begin{align*}
f(x)=f(x)\int_{\mathbb R}K(u)\,d\mathcal L^1(u).
\end{align*}
Thus
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]-f(x)
=\int_{\mathbb R}K(u)\bigl(f(x-h_nu)-f(x)\bigr)\,d\mathcal L^1(u).
\end{align*}
The compact support of $K$ is what lets us use only continuity of $f$ at the single point $x$. Choose $R>0$ such that $\operatorname{supp}K\subset[-R,R]$. Since $f$ is continuous at $x$, for every $\varepsilon>0$ there exists $\delta>0$ such that
\begin{align*}
|f(z)-f(x)|<\frac{\varepsilon}{1+\|K\|_{L^1}}
\end{align*}
whenever $|z-x|<\delta$. Since $h_n\to0$, choose $N\in\mathbb N$ such that $h_nR<\delta$ for all $n\ge N$. For $u\in\operatorname{supp}K$ and $n\ge N$,
\begin{align*}
|x-h_nu-x|=h_n|u|\le h_nR<\delta,
\end{align*}
so the continuity estimate applies. Hence
\begin{align*}
\left|\mathbb E[\hat f_{n,h_n}(x)]-f(x)\right|\le \int_{\mathbb R}|K(u)|\,|f(x-h_nu)-f(x)|\,d\mathcal L^1(u)\le \frac{\varepsilon}{1+\|K\|_{L^1}}\int_{\mathbb R}|K(u)|\,d\mathcal L^1(u)\le \varepsilon.
\end{align*}
This proves
\begin{align*}
\mathbb E[\hat f_{n,h_n}(x)]\to f(x).
\end{align*}
[/guided]
[/step]
[step:Bound the variance by a multiple of $(n h_n)^{-1}$]
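Because $X_1,\dots,X_n$ are independent with common density $f$ and each $Y_{n,i}$ is the same bounded measurable function of $X_i$, the random variables $Y_{n,1},\dots,Y_{n,n}$ are independent, identically distributed, and have finite variance. Hence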
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)=\operatorname{Var}\left(\frac{1}{n}\sum_{i=1}^nY_{n,i}\right)=\frac{1}{n^2}\sum_{i=1}^n\operatorname{Var}(Y_{n,i})=\frac{1}{n}\operatorname{Var}(Y_{n,1}).
\end{align*}
Since $\operatorname{Var}(Y_{n,1})\le \mathbb E[Y_{n,1}^2]$ and $X_1$ has density $f$ with respect to $\mathcal L^1$,
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{1}{n}\int_{\mathbb R}\frac{1}{h_n^2}K\left(\frac{x-y}{h_n}\right)^2f(y)\,d\mathcal L^1(y).
\end{align*}
Using again the change of variables formula with the substitution $u=(x-y)/h_n$, equivalently $y=x-h_nu$, and with $d\mathcal L^1(y)=h_n\,d\mathcal L^1(u)$, gives
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{1}{n h_n}\int_{\mathbb R}K(u)^2f(x-h_nu)\,d\mathcal L^1(u).
\end{align*}
Since $f$ is continuous at $x$, there exists $\delta_0>0$ such that $|f(z)-f(x)|<1$ whenever $|z-x|<\delta_0$. Define $M:=|f(x)|+1$. Then $f(z)\le |f(z)|\le M$ whenever $|z-x|<\delta_0$. With $R>0$ as in the previous step, so that $\operatorname{supp}K\subset[-R,R]$, and since $h_n\to0$, choose $N_0\in\mathbb N$ such that $h_nR<\delta_0$ for all $n\ge N_0$. For $n\ge N_0$ and $u\in\operatorname{supp}K$, we have $|x-h_nu-x|\le h_nR<\delta_0$ and hence $f(x-h_nu)\le M$. Because $K$ is bounded and compactly supported, the constant
\begin{align*}
C_K:=\int_{\mathbb R}K(u)^2\,d\mathcal L^1(u)
\end{align*}
is finite. Since $K$ vanishes outside $\operatorname{supp}K$, it follows that for all $n\ge N_0$,
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{M C_K}{n h_n}.
\end{align*}
Since $n h_n\to\infty$,
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\to0.
\end{align*}
[guided]
The random fluctuation is controlled by its variance. Because $X_1,\dots,X_n$ are independent and each $Y_{n,i}$ is a measurable function of $X_i$, the random variables $Y_{n,1},\dots,Y_{n,n}$ are independent. Therefore the variance of the average is the sum of the variances divided by $n^2$:
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)=\operatorname{Var}\left(\frac{1}{n}\sum_{i=1}^nY_{n,i}\right)=\frac{1}{n^2}\sum_{i=1}^n\operatorname{Var}(Y_{n,i})=\frac{1}{n}\operatorname{Var}(Y_{n,1}).
\end{align*}
The inequality $\operatorname{Var}(Y_{n,1})\le \mathbb E[Y_{n,1}^2]$ reduces the problem to a second-moment estimate. Since $X_1$ has density $f$ with respect to $\mathcal L^1$,
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{1}{n}\int_{\mathbb R}\frac{1}{h_n^2}K\left(\frac{x-y}{h_n}\right)^2f(y)\,d\mathcal L^1(y).
\end{align*}
Apply the change of variables formula with $u=(x-y)/h_n$, equivalently $y=x-h_nu$. The map sends $\mathbb R$ onto $\mathbb R$, and $d\mathcal L^1(y)=h_n\,d\mathcal L^1(u)$. Hence
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{1}{n h_n}\int_{\mathbb R}K(u)^2f(x-h_nu)\,d\mathcal L^1(u).
\end{align*}
Now use continuity of $f$ only near the fixed point $x$. There exists $\delta_0>0$ such that $|f(z)-f(x)|<1$ whenever $|z-x|<\delta_0$. Define $M:=|f(x)|+1$. Then $f(z)\le |f(z)|\le M$ whenever $|z-x|<\delta_0$. Let $R>0$ satisfy $\operatorname{supp}K\subset[-R,R]$; since $h_n\to0$, we may choose $N_0\in\mathbb N$ such that $h_nR<\delta_0$ for all $n\ge N_0$. For $n\ge N_0$ and $u\in\operatorname{supp}K$, this gives $f(x-h_nu)\le M$. Since $K$ is bounded and compactly supported, the constant
\begin{align*}
C_K:=\int_{\mathbb R}K(u)^2\,d\mathcal L^1(u)
\end{align*}
is finite. Since $K$ vanishes outside $\operatorname{supp}K$, it follows that for all $n\ge N_0$,
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\le \frac{M C_K}{n h_n}.
\end{align*}
Since $n h_n\to\infty$, the right-hand side tends to $0$, and so
\begin{align*}
\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)\to0.
\end{align*}
[/guided]
[/step]
[step:Use the variance estimate to control the centred estimator]
Define the random variable $Z_n:\Omega\to\mathbb R$ by
\begin{align*}
Z_n(\omega)=\hat f_{n,h_n}(x)(\omega)-\mathbb E[\hat f_{n,h_n}(x)].
\end{align*}
Then $\mathbb E[Z_n]=0$ and
\begin{align*}
\mathbb E[Z_n^2]=\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr).
\end{align*}
For every $\varepsilon>0$, the pointwise inequality underlying [Chebyshev's inequality](/theorems/1126),
\begin{align*}
\varepsilon^2\mathbb 1_{\{|Z_n|\ge\varepsilon\}}\le Z_n^2,
\end{align*}
implies, after integrating with respect to $\mathbb P$,
\begin{align*}
\mathbb P(|Z_n|\ge\varepsilon)
\le \frac{\mathbb E[Z_n^2]}{\varepsilon^2}
=\frac{\operatorname{Var}\bigl(\hat f_{n,h_n}(x)\bigr)}{\varepsilon^2}.
\end{align*}
The variance estimate from the previous step gives
\begin{align*}
\mathbb P(|Z_n|\ge\varepsilon)\to0.
\end{align*}
Thus
\begin{align*}
\hat f_{n,h_n}(x)-\mathbb E[\hat f_{n,h_n}(x)]\xrightarrow{\mathbb P}0.
\end{align*}
[/step]
[step:Combine the deterministic and stochastic terms]
For every $n\in\mathbb N$,
\begin{align*}
\hat f_{n,h_n}(x)-f(x)
=
\left(\hat f_{n,h_n}(x)-\mathbb E[\hat f_{n,h_n}(x)]\right)
+
\left(\mathbb E[\hat f_{n,h_n}(x)]-f(x)\right).
\end{align*}
Let $\varepsilon>0$. By the bias convergence, choose $N_1\in\mathbb N$ such that
\begin{align*}
\left|\mathbb E[\hat f_{n,h_n}(x)]-f(x)\right|<\frac{\varepsilon}{2}
\end{align*}
for all $n\ge N_1$. Then for $n\ge N_1$,
\begin{align*}
\left\{\left|\hat f_{n,h_n}(x)-f(x)\right|>\varepsilon\right\}
\subset
\left\{\left|\hat f_{n,h_n}(x)-\mathbb E[\hat f_{n,h_n}(x)]\right|>\frac{\varepsilon}{2}\right\}.
\end{align*}
Taking probabilities and using the centred convergence from the previous step with $\varepsilon/2$ in place of $\varepsilon$,
\begin{align*}
\mathbb P\left(\left|\hat f_{n,h_n}(x)-f(x)\right|>\varepsilon\right)\to0.
\end{align*}
Since this holds for every $\varepsilon>0$,
\begin{align*}
\hat f_{n,h_n}(x)\xrightarrow{\mathbb P}f(x).
\end{align*}
[/step]