[proofplan]
Write the least-squares error as the product of the inverse empirical Gram matrix and the empirical score. The empirical Gram matrix converges in probability to the positive definite matrix $Q$, so with probability tending to one it is invertible and its inverse is uniformly bounded. The empirical score has mean zero and variance of order $1/n$, hence converges in probability to $0$. Combining these two estimates gives convergence in probability of $\hat{\beta}_n-\beta$ to $0$, and the arbitrary definition on singular samples is irrelevant because those samples have probability tending to $0$.
[/proofplan]
custom_env
admin
[step:Express the least-squares error through the empirical Gram matrix and score]
For each $n \in \mathbb{N}$, define the empirical Gram matrix $A_n \in \mathbb{R}^{p \times p}$ and empirical score vector $b_n \in \mathbb{R}^p$ by
\begin{align*}
A_n := \frac{1}{n}X_n^\top X_n
= \frac{1}{n}\sum_{i=1}^{n} Z_i Z_i^\top,
\qquad
b_n := \frac{1}{n}X_n^\top \varepsilon_n
= \frac{1}{n}\sum_{i=1}^{n} Z_i \varepsilon_i,
\end{align*}
where $\varepsilon_n := (\varepsilon_1,\dots,\varepsilon_n)^\top \in \mathbb{R}^n$.
On the event
\begin{align*}
E_n := \{X_n^\top X_n \text{ is invertible}\},
\end{align*}
the matrix $A_n$ is invertible and
\begin{align*}
\hat{\beta}_n
&= (X_n^\top X_n)^{-1}X_n^\top (X_n\beta+\varepsilon_n) \\
&= \beta + (X_n^\top X_n)^{-1}X_n^\top \varepsilon_n \\
&= \beta + A_n^{-1}b_n.
\end{align*}
Thus, on $E_n$,
\begin{align*}
\hat{\beta}_n-\beta = A_n^{-1}b_n.
\end{align*}
[/step]
custom_env
admin
[step:Show that the empirical Gram matrix is invertible with probability tending to one]
For $1 \le j,k \le p$, the $(j,k)$ entry of $A_n$ is
\begin{align*}
(A_n)_{jk} = \frac{1}{n}\sum_{i=1}^{n} Z_{ij}Z_{ik}.
\end{align*}
Since $\mathbb{E}[Z_iZ_i^\top]=Q$, each scalar random variable $Z_{ij}Z_{ik}$ is integrable and has expectation $Q_{jk}$. By the [weak law of large numbers](/theorems/1127) for integrable scalar random variables (citing a result not yet in the wiki: [Weak Law of Large Numbers](/theorems/1851)),
\begin{align*}
(A_n)_{jk} \xrightarrow{\mathbb{P}} Q_{jk}.
\end{align*}
Because $p$ is fixed and finite, entrywise convergence implies convergence in operator norm:
\begin{align*}
\|A_n-Q\|_{\mathrm{op}} \xrightarrow{\mathbb{P}} 0.
\end{align*}
Let
\begin{align*}
\lambda_0 := \min_{|v|=1} v^\top Qv.
\end{align*}
Since $Q$ is positive definite, $\lambda_0>0$. Define
\begin{align*}
F_n := \left\{\|A_n-Q\|_{\mathrm{op}} < \frac{\lambda_0}{2}\right\}.
\end{align*}
Then $\mathbb{P}(F_n)\to 1$. If $F_n$ occurs and $v \in \mathbb{R}^p$ satisfies $|v|=1$, then
\begin{align*}
v^\top A_n v
&= v^\top Qv + v^\top(A_n-Q)v \\
&\geq \lambda_0 - \|A_n-Q\|_{\mathrm{op}} \\
&> \frac{\lambda_0}{2}.
\end{align*}
Therefore $A_n$ is positive definite on $F_n$, hence invertible on $F_n$. Since $A_n=n^{-1}X_n^\top X_n$, this implies $F_n \subset E_n$, and consequently
\begin{align*}
\mathbb{P}(E_n^c) \leq \mathbb{P}(F_n^c) \to 0.
\end{align*}
[/step]
custom_env
admin
[step:Bound the inverse Gram matrix on the high-probability event]
On $F_n$, the previous estimate gives
\begin{align*}
v^\top A_n v \geq \frac{\lambda_0}{2}|v|^2
\end{align*}
for every $v \in \mathbb{R}^p$. Since $A_n$ is symmetric and positive definite on $F_n$, its smallest eigenvalue is at least $\lambda_0/2$. Therefore its inverse satisfies the operator norm bound
\begin{align*}
\|A_n^{-1}\|_{\mathrm{op}} \leq \frac{2}{\lambda_0}
\end{align*}
on $F_n$.
[/step]
custom_env
admin
[step:Show that the empirical score converges to zero in probability]
For each coordinate $1 \le j \le p$,
\begin{align*}
\mathbb{E}[Z_{ij}\varepsilon_i]
&= \mathbb{E}\left[\mathbb{E}[Z_{ij}\varepsilon_i \mid Z_i]\right] \\
&= \mathbb{E}\left[Z_{ij}\mathbb{E}[\varepsilon_i \mid Z_i]\right] \\
&= 0.
\end{align*}
The integrability needed here follows from
\begin{align*}
\mathbb{E}[|Z_{ij}\varepsilon_i|]
\leq \mathbb{E}[|Z_i||\varepsilon_i|]
\leq \left(\mathbb{E}[|\varepsilon_i|^2|Z_i|^2]\right)^{1/2}
< \infty.
\end{align*}
Moreover,
\begin{align*}
\mathbb{E}[|Z_i\varepsilon_i|^2]
= \mathbb{E}[|\varepsilon_i|^2|Z_i|^2]
< \infty.
\end{align*}
Since the vectors $Z_i\varepsilon_i$ are i.i.d. with mean zero, independence gives
\begin{align*}
\mathbb{E}[|b_n|^2]
&= \mathbb{E}\left[\left|\frac{1}{n}\sum_{i=1}^{n}Z_i\varepsilon_i\right|^2\right] \\
&= \frac{1}{n^2}\sum_{i=1}^{n}\mathbb{E}[|Z_i\varepsilon_i|^2] \\
&= \frac{1}{n}\mathbb{E}[|Z_1\varepsilon_1|^2].
\end{align*}
For every $\eta>0$, [Chebyshev's inequality](/theorems/1126) gives
\begin{align*}
\mathbb{P}(|b_n|>\eta)
\leq \frac{\mathbb{E}[|b_n|^2]}{\eta^2}
= \frac{\mathbb{E}[|Z_1\varepsilon_1|^2]}{n\eta^2}
\to 0.
\end{align*}
Hence
\begin{align*}
b_n \xrightarrow{\mathbb{P}} 0.
\end{align*}
[/step]
custom_env
admin
[step:Combine the high-probability inverse bound with the score convergence]
Let $\eta>0$. Since $F_n \subset E_n$, on $F_n$ we have
\begin{align*}
|\hat{\beta}_n-\beta|
= |A_n^{-1}b_n|
\leq \|A_n^{-1}\|_{\mathrm{op}}|b_n|
\leq \frac{2}{\lambda_0}|b_n|.
\end{align*}
Therefore
\begin{align*}
\mathbb{P}(|\hat{\beta}_n-\beta|>\eta)
&\leq \mathbb{P}(F_n^c)
+ \mathbb{P}\left(\frac{2}{\lambda_0}|b_n|>\eta\right) \\
&= \mathbb{P}(F_n^c)
+ \mathbb{P}\left(|b_n|>\frac{\lambda_0\eta}{2}\right).
\end{align*}
The first term tends to $0$ because $A_n \xrightarrow{\mathbb{P}} Q$, and the second term tends to $0$ because $b_n \xrightarrow{\mathbb{P}}0$. Hence
\begin{align*}
\mathbb{P}(|\hat{\beta}_n-\beta|>\eta) \to 0
\end{align*}
for every $\eta>0$. This is precisely
\begin{align*}
\hat{\beta}_n \xrightarrow{\mathbb{P}} \beta.
\end{align*}
[/step]