[proofplan]
The estimator error is decomposed as the inverse sample second-moment matrix multiplied by the sample average of the orthogonality moments $X_i u_i$. We first prove the needed [weak law of large numbers](/theorems/1127) for integrable finite-dimensional random vectors, then apply it to $X_iX_i^\top$ and $X_i u_i$. Since the population matrix $Q$ is positive definite, matrices sufficiently close to $Q$ are invertible and their inverses are uniformly bounded. The product representation then implies that $\hat{\beta}_n-\beta_0$ converges to $0$ in probability, while the event on which the sample matrix is singular has probability tending to $0$.
[/proofplan]
custom_env
admin
[step:Prove the weak law for integrable finite-dimensional random vectors]
Let $d \in \mathbb{N}$ and let $Z_1,Z_2,\dots : \Omega \to \mathbb{R}^d$ be i.i.d. random vectors with $\mathbb{E}[|Z_1|] < \infty$. We claim that
\begin{align*}
\frac{1}{n}\sum_{i=1}^n Z_i \xrightarrow{\mathbb{P}} \mathbb{E}[Z_1].
\end{align*}
For $M > 0$, define the truncated random vector $Z_i^{(M)} : \Omega \to \mathbb{R}^d$ by
\begin{align*}
Z_i^{(M)} := Z_i\,\mathbb{1}_{\{|Z_i|\le M\}}.
\end{align*}
Then $|Z_i^{(M)}| \le M$ and
\begin{align*}
\mathbb{E}\left[\left|Z_1 - Z_1^{(M)}\right|\right]
=
\mathbb{E}\left[|Z_1|\mathbb{1}_{\{|Z_1|>M\}}\right]
\to 0
\end{align*}
as $M \to \infty$, by the dominated convergence theorem, since $|Z_1|\mathbb{1}_{\{|Z_1|>M\}} \le |Z_1|$ pointwise and $|Z_1|$ is integrable.
Fix $\varepsilon > 0$. Choose $M > 0$ such that
\begin{align*}
\mathbb{E}\left[\left|Z_1 - Z_1^{(M)}\right|\right] < \frac{\varepsilon^2}{12}.
\end{align*}
By Markov's inequality applied to the non-negative random variable
\begin{align*}
\left|\frac{1}{n}\sum_{i=1}^n (Z_i-Z_i^{(M)})\right|,
\end{align*}
we get
\begin{align*}
\mathbb{P}\left(
\left|\frac{1}{n}\sum_{i=1}^n (Z_i-Z_i^{(M)})\right|>\frac{\varepsilon}{3}
\right)
\le
\frac{3}{\varepsilon}\mathbb{E}\left[\left|Z_1-Z_1^{(M)}\right|\right].
\end{align*}
Also
\begin{align*}
\left|\mathbb{E}[Z_1-Z_1^{(M)}]\right|
\le
\mathbb{E}\left[\left|Z_1-Z_1^{(M)}\right|\right].
\end{align*}
For the bounded centered average, independence gives
\begin{align*}
\mathbb{E}\left[
\left|
\frac{1}{n}\sum_{i=1}^n \left(Z_i^{(M)}-\mathbb{E}[Z_1^{(M)}]\right)
\right|^2
\right]
=
\frac{1}{n}\mathbb{E}\left[\left|Z_1^{(M)}-\mathbb{E}[Z_1^{(M)}]\right|^2\right]
\le
\frac{4M^2}{n}.
\end{align*}
Therefore [Chebyshev's inequality](/theorems/1126) gives
\begin{align*}
\mathbb{P}\left(
\left|
\frac{1}{n}\sum_{i=1}^n \left(Z_i^{(M)}-\mathbb{E}[Z_1^{(M)}]\right)
\right|>\frac{\varepsilon}{3}
\right)
\le
\frac{36M^2}{n\varepsilon^2}.
\end{align*}
Combining the three estimates by the triangle inequality, we obtain
\begin{align*}
\limsup_{n\to\infty}
\mathbb{P}\left(
\left|
\frac{1}{n}\sum_{i=1}^n Z_i-\mathbb{E}[Z_1]
\right|>\varepsilon
\right)
\le
\frac{3}{\varepsilon}\mathbb{E}\left[\left|Z_1-Z_1^{(M)}\right|\right].
\end{align*}
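This bound is valid for every $M$ large enough that $\mathbb{E}\left[\left|Z_1-Z_1^{(M)}\right|\right]\le \varepsilon/3$: for such $M$ the deterministic bias term $\left|\mathbb{E}[Z_1^{(M)}]-\mathbb{E}[Z_1]\right|$ is at most $\varepsilon/3$, and the Chebyshev term tends to $0$ as $n\to\infty$ for fixed $M$. The left-hand side does not depend on $M$, while the right-hand side tends to $0$ as $M \to \infty$, so the $\limsup$ equals $0$. Since $\varepsilon>0$ was arbitrary, the claimed convergence in probability holds.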
[/step]
custom_env
admin
[step:Apply the weak law to the sample second moment and orthogonality moment]Define matrix-valued random variables $A_i : \Omega \to \mathbb{R}^{p\times p}$ and vector-valued random variables $b_i : \Omega \to \mathbb{R}^p$ by
\begin{align*}
A_i := X_iX_i^\top,\qquad b_i := X_i u_i.
\end{align*}
Equip $\mathbb{R}^{p\times p}$ with the Frobenius norm $|A|_F := \left(\sum_{j,k=1}^p A_{jk}^2\right)^{1/2}$. Since
\begin{align*}
|A_i|_F = |X_i|^2,
\end{align*}
the hypothesis $\mathbb{E}[|X_i|^2]<\infty$ gives $\mathbb{E}[|A_i|_F]<\infty$.
Also
\begin{align*}
u_i = Y_i-X_i^\top\beta_0,
\end{align*}
so
\begin{align*}
\mathbb{E}[|u_i|^2]
\le
2\mathbb{E}[|Y_i|^2]+2|\beta_0|^2\mathbb{E}[|X_i|^2]
<\infty.
\end{align*}
By the [Cauchy-Schwarz inequality](/theorems/432) applied to the real random variables $|X_i|$ and $|u_i|$,
\begin{align*}
\mathbb{E}[|b_i|]
=
\mathbb{E}[|X_i||u_i|]
\le
\left(\mathbb{E}[|X_i|^2]\right)^{1/2}
\left(\mathbb{E}[|u_i|^2]\right)^{1/2}
<\infty.
\end{align*}
The weak law from the previous step therefore applies to $(A_i)_{i=1}^\infty$ and $(b_i)_{i=1}^\infty$, yielding
\begin{align*}
S_n=\frac{1}{n}\sum_{i=1}^n A_i \xrightarrow{\mathbb{P}} Q,
\qquad
m_n:=\frac{1}{n}\sum_{i=1}^n b_i \xrightarrow{\mathbb{P}} \mathbb{E}[X_i u_i]=0.
\end{align*}[/step]
custom_env
admin
[guided]The sample quantities we need are averages of i.i.d. finite-dimensional random vectors, so the previous step is designed exactly for them. For the matrix average, we view a $p\times p$ matrix as a vector in $\mathbb{R}^{p^2}$ and use the Frobenius norm
\begin{align*}
|A|_F := \left(\sum_{j,k=1}^p A_{jk}^2\right)^{1/2}.
\end{align*}
For $A_i := X_iX_i^\top$, the entries are $(A_i)_{jk}=(X_i)_j(X_i)_k$, and a direct computation gives
\begin{align*}
|A_i|_F^2
=
\sum_{j,k=1}^p (X_i)_j^2(X_i)_k^2
=
\left(\sum_{j=1}^p (X_i)_j^2\right)^2
=
|X_i|^4.
\end{align*}
Thus $|A_i|_F=|X_i|^2$, and the assumption $\mathbb{E}[|X_i|^2]<\infty$ is exactly the integrability hypothesis needed for the weak law.
For the moment vector $b_i := X_i u_i$, we must first check integrability. Since $u_i=Y_i-X_i^\top\beta_0$, the elementary inequality $|a-b|^2\le 2|a|^2+2|b|^2$ gives
\begin{align*}
\mathbb{E}[|u_i|^2]
\le
2\mathbb{E}[|Y_i|^2]+2\mathbb{E}[|X_i^\top\beta_0|^2]
\le
2\mathbb{E}[|Y_i|^2]+2|\beta_0|^2\mathbb{E}[|X_i|^2]
<\infty.
\end{align*}
Applying Cauchy-Schwarz to $|X_i|$ and $|u_i|$ gives
\begin{align*}
\mathbb{E}[|X_i u_i|]
=
\mathbb{E}[|X_i||u_i|]
\le
\left(\mathbb{E}[|X_i|^2]\right)^{1/2}
\left(\mathbb{E}[|u_i|^2]\right)^{1/2}
<\infty.
\end{align*}
Therefore the weak law applies and yields
\begin{align*}
\frac{1}{n}\sum_{i=1}^n X_iX_i^\top \xrightarrow{\mathbb{P}} \mathbb{E}[X_iX_i^\top]=Q,
\qquad
\frac{1}{n}\sum_{i=1}^n X_i u_i \xrightarrow{\mathbb{P}} \mathbb{E}[X_i u_i]=0.
\end{align*}[/guided]
custom_env
admin
[step:Control inversion near the positive definite population matrix]Let $|B|_{\mathrm{op}}$ denote the operator norm of a matrix $B \in \mathbb{R}^{p\times p}$ acting as a [linear map](/page/Linear%20Map) from $(\mathbb{R}^p,|\cdot|)$ to itself, namely
\begin{align*}
|B|_{\mathrm{op}} := \sup\{|Bv| : v \in \mathbb{R}^p,\ |v|=1\}.
\end{align*}
For an invertible matrix $B \in \mathbb{R}^{p\times p}$, write $\|B^{-1}\|_{\mathrm{op}}$ for the same operator norm applied to $B^{-1}$. Let $\lambda_{\min}(Q)>0$ denote the smallest eigenvalue of the symmetric positive definite matrix $Q$. Define
\begin{align*}
\delta := \frac{\lambda_{\min}(Q)}{2}.
\end{align*}
If $A \in \mathbb{R}^{p\times p}$ is symmetric and $|A-Q|_{\mathrm{op}}<\delta$, then for every $v\in\mathbb{R}^p$ with $|v|=1$,
\begin{align*}
v^\top A v
=
v^\top Qv+v^\top(A-Q)v
\ge
\lambda_{\min}(Q)-|A-Q|_{\mathrm{op}}
>
\delta.
\end{align*}
Thus $A$ is positive definite and invertible. Moreover,
\begin{align*}
\|A^{-1}\|_{\mathrm{op}}
=
\frac{1}{\lambda_{\min}(A)}
\le
\frac{1}{\delta}
=
\frac{2}{\lambda_{\min}(Q)}.
\end{align*}
Since $|B|_{\mathrm{op}}\le |B|_F$ for every $B\in\mathbb{R}^{p\times p}$ and $S_n \xrightarrow{\mathbb{P}} Q$ in Frobenius norm, we have
\begin{align*}
\mathbb{P}\left(|S_n-Q|_{\mathrm{op}}<\delta\right)\to 1.
\end{align*}
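Each $S_n$ is symmetric, being an average of the symmetric matrices $X_iX_i^\top$, so the deterministic bound above applies with $A=S_n$. On this event, $S_n$ is invertible and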
\begin{align*}
\|S_n^{-1}\|_{\mathrm{op}}\le \frac{2}{\lambda_{\min}(Q)}.
\end{align*}[/step]
custom_env
admin
[guided]The only delicate point in the OLS formula is the inverse $S_n^{-1}$. We use the operator norm to measure how close a random matrix is to the population matrix. For a matrix $B \in \mathbb{R}^{p\times p}$, define
\begin{align*}
|B|_{\mathrm{op}} := \sup\{|Bv| : v \in \mathbb{R}^p,\ |v|=1\},
\end{align*}
which is the operator norm of $B$ as a linear map from $(\mathbb{R}^p,|\cdot|)$ to itself. For an invertible matrix $B \in \mathbb{R}^{p\times p}$, the notation $\|B^{-1}\|_{\mathrm{op}}$ means the same operator norm applied to the inverse linear map $B^{-1}:\mathbb{R}^p\to\mathbb{R}^p$.
The population matrix $Q$ is positive definite, so it is bounded away from singularity. Let $\lambda_{\min}(Q)>0$ be its smallest eigenvalue and set
\begin{align*}
\delta := \frac{\lambda_{\min}(Q)}{2}.
\end{align*}
Suppose $A$ is a symmetric matrix satisfying $|A-Q|_{\mathrm{op}}<\delta$. For any unit vector $v\in\mathbb{R}^p$, the quadratic form of $A$ satisfies
\begin{align*}
v^\top A v
=
v^\top Qv+v^\top(A-Q)v
\ge
\lambda_{\min}(Q)-|A-Q|_{\mathrm{op}}
>
\delta.
\end{align*}
This proves that $A$ is positive definite. A positive definite matrix is invertible, and its inverse has operator norm equal to the reciprocal of its smallest eigenvalue, so
\begin{align*}
\|A^{-1}\|_{\mathrm{op}}
=
\frac{1}{\lambda_{\min}(A)}
\le
\frac{1}{\delta}
=
\frac{2}{\lambda_{\min}(Q)}.
\end{align*}
We apply this deterministic fact to $A=S_n$, which is symmetric because it is an average of the symmetric matrices $X_iX_i^\top$. The convergence $S_n\to Q$ in probability was obtained in Frobenius norm, and the operator norm is bounded by the Frobenius norm:
\begin{align*}
|B|_{\mathrm{op}}\le |B|_F
\end{align*}
for every $B\in\mathbb{R}^{p\times p}$. Hence
\begin{align*}
\mathbb{P}\left(|S_n-Q|_{\mathrm{op}}<\delta\right)\to 1.
\end{align*}
On this high-probability event, $S_n$ is invertible and its inverse is uniformly controlled:
\begin{align*}
\|S_n^{-1}\|_{\mathrm{op}}\le \frac{2}{\lambda_{\min}(Q)}.
\end{align*}[/guided]
custom_env
admin
[step:Decompose the OLS error and prove convergence in probability]On the event that $S_n$ is invertible, the OLS estimator is $\hat{\beta}_n=S_n^{-1}r_n$. Since $Y_i=X_i^\top\beta_0+u_i$, we have
\begin{align*}
r_n
=
\frac{1}{n}\sum_{i=1}^n X_iY_i
=
\frac{1}{n}\sum_{i=1}^n X_i(X_i^\top\beta_0+u_i)
=
S_n\beta_0+m_n,
\end{align*}
where
\begin{align*}
m_n := \frac{1}{n}\sum_{i=1}^n X_i u_i.
\end{align*}
Therefore
\begin{align*}
\hat{\beta}_n-\beta_0
=
S_n^{-1}m_n
\end{align*}
on the event that $S_n$ is invertible.
Fix $\varepsilon>0$ and define the event
\begin{align*}
E_n := \left\{|S_n-Q|_{\mathrm{op}}<\delta\right\}.
\end{align*}
On $E_n$,
\begin{align*}
|\hat{\beta}_n-\beta_0|
\le
\|S_n^{-1}\|_{\mathrm{op}}|m_n|
\le
\frac{2}{\lambda_{\min}(Q)}|m_n|.
\end{align*}
Hence
\begin{align*}
\mathbb{P}\left(|\hat{\beta}_n-\beta_0|>\varepsilon\right)
\le
\mathbb{P}(E_n^c)
+
\mathbb{P}\left(
|m_n|>\frac{\varepsilon\lambda_{\min}(Q)}{2}
\right).
\end{align*}
The first term tends to $0$ by the previous step, and the second term tends to $0$ because $m_n\xrightarrow{\mathbb{P}}0$. Therefore
\begin{align*}
\hat{\beta}_n \xrightarrow{\mathbb{P}} \beta_0.
\end{align*}[/step]
custom_env
admin
[guided]We now use the regression equation to expose the error term. On the event that $S_n$ is invertible, the OLS estimator is $\hat{\beta}_n=S_n^{-1}r_n$. Since $Y_i=X_i^\top\beta_0+u_i$, we compute
\begin{align*}
r_n
=
\frac{1}{n}\sum_{i=1}^n X_iY_i
=
\frac{1}{n}\sum_{i=1}^n X_i(X_i^\top\beta_0+u_i)
=
\left(\frac{1}{n}\sum_{i=1}^n X_iX_i^\top\right)\beta_0
+
\frac{1}{n}\sum_{i=1}^n X_i u_i.
\end{align*}
Using the definitions
\begin{align*}
S_n := \frac{1}{n}\sum_{i=1}^n X_iX_i^\top,\qquad
m_n := \frac{1}{n}\sum_{i=1}^n X_i u_i,
\end{align*}
this becomes
\begin{align*}
r_n=S_n\beta_0+m_n.
\end{align*}
Multiplying by $S_n^{-1}$ on the event where $S_n$ is invertible gives
\begin{align*}
\hat{\beta}_n-\beta_0
=
S_n^{-1}r_n-\beta_0
=
S_n^{-1}(S_n\beta_0+m_n)-\beta_0
=
S_n^{-1}m_n.
\end{align*}
Now fix $\varepsilon>0$ and let
\begin{align*}
E_n := \left\{|S_n-Q|_{\mathrm{op}}<\delta\right\},
\end{align*}
where $\delta=\lambda_{\min}(Q)/2$. The previous step showed that $\mathbb{P}(E_n)\to 1$, and that on $E_n$ the matrix $S_n$ is invertible with
\begin{align*}
\|S_n^{-1}\|_{\mathrm{op}}\le \frac{2}{\lambda_{\min}(Q)}.
\end{align*}
Therefore, on $E_n$,
\begin{align*}
|\hat{\beta}_n-\beta_0|
=
|S_n^{-1}m_n|
\le
\|S_n^{-1}\|_{\mathrm{op}}|m_n|
\le
\frac{2}{\lambda_{\min}(Q)}|m_n|.
\end{align*}
Consequently,
\begin{align*}
\mathbb{P}\left(|\hat{\beta}_n-\beta_0|>\varepsilon\right)
\le
\mathbb{P}(E_n^c)
+
\mathbb{P}\left(
\frac{2}{\lambda_{\min}(Q)}|m_n|>\varepsilon
\right),
\end{align*}
that is,
\begin{align*}
\mathbb{P}\left(|\hat{\beta}_n-\beta_0|>\varepsilon\right)
\le
\mathbb{P}(E_n^c)
+
\mathbb{P}\left(
|m_n|>\frac{\varepsilon\lambda_{\min}(Q)}{2}
\right).
\end{align*}
The first probability tends to $0$ because $S_n$ is close to $Q$ with probability tending to $1$. The second tends to $0$ because the sample orthogonality moment $m_n$ converges to $0$ in probability. Since this holds for every $\varepsilon>0$, we conclude
\begin{align*}
\hat{\beta}_n \xrightarrow{\mathbb{P}} \beta_0.
\end{align*}[/guided]