[proofplan]
The proof is a conditional Gaussian regression argument. First we compute the conditional law of $X_t$ given $X_{t+1}$ and the observations $Y_{1:t}$, using the joint Gaussian law of $(X_t,X_{t+1})$ after the filtering pass. Then we condition this affine Gaussian representation on the full observation record $Y_{1:n}$ and use the Markov structure of the state-space model to replace future information by its effect on $X_{t+1}$. The resulting conditional mean and covariance are exactly the Rauch--Tung--Striebel backward recursions.
[/proofplan]
[step:Compute the backward conditional Gaussian regression]
Fix $t \in \{1,\dots,n-1\}$. Let $\mathcal{F}_t := \sigma(Y_1,\dots,Y_t)$ and $\mathcal{F}_n := \sigma(Y_1,\dots,Y_n)$. By the Kalman filtering hypotheses, conditional on $\mathcal{F}_t$, the random vector
\begin{align*}
Z_t :=
\begin{pmatrix}
X_t \\
X_{t+1}
\end{pmatrix}
\end{align*}
is Gaussian with conditional mean
\begin{align*}
m_t :=
\begin{pmatrix}
a_{t|t} \\
a_{t+1|t}
\end{pmatrix}
\end{align*}
and conditional covariance matrix
\begin{align*}
\Sigma_t :=
\begin{pmatrix}
P_{t|t} & P_{t|t}T_t^\top \\
T_tP_{t|t} & P_{t+1|t}
\end{pmatrix}.
\end{align*}
Since $P_{t+1|t}$ is invertible, the conditional Gaussian regression formula gives
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_t]
&= a_{t|t}+P_{t|t}T_t^\top P_{t+1|t}^{-1}(X_{t+1}-a_{t+1|t}) \\
&= a_{t|t}+J_t(X_{t+1}-a_{t+1|t}),
\end{align*}
and
\begin{align*}
\operatorname{Cov}(X_t \mid X_{t+1},\mathcal{F}_t)
&= P_{t|t}-P_{t|t}T_t^\top P_{t+1|t}^{-1}T_tP_{t|t} \\
&= P_{t|t}-J_tP_{t+1|t}J_t^\top.
\end{align*}
[guided]
Fix $t \in \{1,\dots,n-1\}$ and define the observation sigma-algebras
\begin{align*}
\mathcal{F}_t &:= \sigma(Y_1,\dots,Y_t), \\
\mathcal{F}_n &:= \sigma(Y_1,\dots,Y_n).
\end{align*}
The forward Kalman filter gives a conditional Gaussian law for $X_t$ given $\mathcal{F}_t$ and for the one-step prediction $X_{t+1}$ given $\mathcal{F}_t$. Because the transition from $X_t$ to $X_{t+1}$ is linear Gaussian, the pair
\begin{align*}
Z_t :=
\begin{pmatrix}
X_t \\
X_{t+1}
\end{pmatrix}
\end{align*}
is also conditionally Gaussian given $\mathcal{F}_t$. Its conditional mean is
\begin{align*}
m_t :=
\begin{pmatrix}
a_{t|t} \\
a_{t+1|t}
\end{pmatrix},
\end{align*}
and its conditional covariance is
\begin{align*}
\Sigma_t :=
\begin{pmatrix}
P_{t|t} & \operatorname{Cov}(X_t,X_{t+1}\mid \mathcal{F}_t) \\
\operatorname{Cov}(X_{t+1},X_t\mid \mathcal{F}_t) & P_{t+1|t}
\end{pmatrix}.
\end{align*}
By the hypothesis on the linear part of the state transition,
\begin{align*}
\operatorname{Cov}(X_t,X_{t+1}\mid \mathcal{F}_t)=P_{t|t}T_t^\top,
\end{align*}
and therefore
\begin{align*}
\Sigma_t =
\begin{pmatrix}
P_{t|t} & P_{t|t}T_t^\top \\
T_tP_{t|t} & P_{t+1|t}
\end{pmatrix}.
\end{align*}
Now apply the conditional Gaussian regression formula to the block vector $(X_t,X_{t+1})$ conditional on $\mathcal{F}_t$. The required invertibility hypothesis is precisely the assumption that $P_{t+1|t}$ is invertible. Hence
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_t]
&= a_{t|t}+\operatorname{Cov}(X_t,X_{t+1}\mid \mathcal{F}_t)P_{t+1|t}^{-1}(X_{t+1}-a_{t+1|t}) \\
&= a_{t|t}+P_{t|t}T_t^\top P_{t+1|t}^{-1}(X_{t+1}-a_{t+1|t}).
\end{align*}
Since the smoothing gain is defined by
\begin{align*}
J_t := P_{t|t}T_t^\top P_{t+1|t}^{-1},
\end{align*}
this becomes
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_t]
= a_{t|t}+J_t(X_{t+1}-a_{t+1|t}).
\end{align*}
The same Gaussian regression formula gives the conditional covariance
\begin{align*}
\operatorname{Cov}(X_t \mid X_{t+1},\mathcal{F}_t)
&= P_{t|t}
-\operatorname{Cov}(X_t,X_{t+1}\mid \mathcal{F}_t)P_{t+1|t}^{-1}
\operatorname{Cov}(X_{t+1},X_t\mid \mathcal{F}_t) \\
&= P_{t|t}-P_{t|t}T_t^\top P_{t+1|t}^{-1}T_tP_{t|t}.
\end{align*}
Using $J_t=P_{t|t}T_t^\top P_{t+1|t}^{-1}$ and the symmetry of $P_{t+1|t}$, this is
\begin{align*}
\operatorname{Cov}(X_t \mid X_{t+1},\mathcal{F}_t)
= P_{t|t}-J_tP_{t+1|t}J_t^\top.
\end{align*}
[/guided]
[/step]
[step:Condition the backward regression mean on the full observation record]
The state-space Markov property gives
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_n]
=
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_t].
\end{align*}
Taking conditional expectation with respect to $\mathcal{F}_n$ in the regression identity from the previous step yields
\begin{align*}
a_{t|n}
&= \mathbb{E}[X_t \mid \mathcal{F}_n] \\
&= \mathbb{E}\!\left[\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_n]\mid \mathcal{F}_n\right] \\
&= \mathbb{E}\!\left[a_{t|t}+J_t(X_{t+1}-a_{t+1|t})\mid \mathcal{F}_n\right] \\
&= a_{t|t}+J_t(a_{t+1|n}-a_{t+1|t}).
\end{align*}
[guided]
The future observations $Y_{t+1},\dots,Y_n$ affect $X_t$ only through their information about the later state $X_{t+1}$. Formally, the Markov property of the linear Gaussian state-space model implies
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_n]
=
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_t].
\end{align*}
This is the point where the state-space structure is used: once $X_{t+1}$ and the past observations $\mathcal{F}_t$ are known, the future observations add no further direct information about $X_t$.
Using the tower property of conditional expectation, we compute
\begin{align*}
a_{t|n}
&= \mathbb{E}[X_t \mid \mathcal{F}_n] \\
&= \mathbb{E}\!\left[\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_n]\mid \mathcal{F}_n\right].
\end{align*}
By the Markov property, the inner conditional expectation equals the backward regression mean already computed:
\begin{align*}
\mathbb{E}[X_t \mid X_{t+1},\mathcal{F}_n]
=
a_{t|t}+J_t(X_{t+1}-a_{t+1|t}).
\end{align*}
Therefore
\begin{align*}
a_{t|n}
&= \mathbb{E}\!\left[a_{t|t}+J_t(X_{t+1}-a_{t+1|t})\mid \mathcal{F}_n\right] \\
&= a_{t|t}+J_t\left(\mathbb{E}[X_{t+1}\mid \mathcal{F}_n]-a_{t+1|t}\right).
\end{align*}
Since
\begin{align*}
a_{t+1|n}:=\mathbb{E}[X_{t+1}\mid \mathcal{F}_n],
\end{align*}
we obtain
\begin{align*}
a_{t|n}=a_{t|t}+J_t(a_{t+1|n}-a_{t+1|t}).
\end{align*}
[/guided]
[/step]
[step:Condition the backward regression covariance on the full observation record]
Define the regression residual
\begin{align*}
\varepsilon_t := X_t-a_{t|t}-J_t(X_{t+1}-a_{t+1|t}).
\end{align*}
From the conditional Gaussian regression above,
\begin{align*}
\mathbb{E}[\varepsilon_t \mid X_{t+1},\mathcal{F}_t]=0,
\qquad
\operatorname{Cov}(\varepsilon_t \mid X_{t+1},\mathcal{F}_t)
=P_{t|t}-J_tP_{t+1|t}J_t^\top.
\end{align*}
The same Markov conditional independence implies that $\varepsilon_t$ is conditionally independent of $\mathcal{F}_n$ given $(X_{t+1},\mathcal{F}_t)$. Hence
\begin{align*}
\operatorname{Cov}(\varepsilon_t \mid \mathcal{F}_n)
=
P_{t|t}-J_tP_{t+1|t}J_t^\top,
\end{align*}
and the cross-covariance between $\varepsilon_t$ and $X_{t+1}$ conditional on $\mathcal{F}_n$ is zero.
Using the decomposition
\begin{align*}
X_t-a_{t|n}
=
\varepsilon_t+J_t(X_{t+1}-a_{t+1|n}),
\end{align*}
we obtain
\begin{align*}
P_{t|n}
&= \operatorname{Cov}(X_t\mid \mathcal{F}_n) \\
&= \operatorname{Cov}(\varepsilon_t\mid \mathcal{F}_n)
+J_t\operatorname{Cov}(X_{t+1}\mid \mathcal{F}_n)J_t^\top \\
&= P_{t|t}-J_tP_{t+1|t}J_t^\top+J_tP_{t+1|n}J_t^\top \\
&= P_{t|t}+J_t(P_{t+1|n}-P_{t+1|t})J_t^\top.
\end{align*}
[guided]
Define the regression residual
\begin{align*}
\varepsilon_t := X_t-a_{t|t}-J_t(X_{t+1}-a_{t+1|t}).
\end{align*}
This residual is the part of $X_t$ that remains after the best affine prediction from $X_{t+1}$ and the information $\mathcal{F}_t$. From the conditional Gaussian regression computation,
\begin{align*}
\mathbb{E}[\varepsilon_t \mid X_{t+1},\mathcal{F}_t]=0
\end{align*}
and
\begin{align*}
\operatorname{Cov}(\varepsilon_t \mid X_{t+1},\mathcal{F}_t)
=P_{t|t}-J_tP_{t+1|t}J_t^\top.
\end{align*}
Because the joint law is Gaussian, this residual is conditionally independent of $X_{t+1}$ given $\mathcal{F}_t$. The state-space Markov property also says that the future observations give no additional direct information about this residual once $X_{t+1}$ and $\mathcal{F}_t$ are known. Thus conditioning on the full observation record preserves the residual covariance:
\begin{align*}
\operatorname{Cov}(\varepsilon_t \mid \mathcal{F}_n)
=
P_{t|t}-J_tP_{t+1|t}J_t^\top.
\end{align*}
Moreover, the conditional cross-covariance of $\varepsilon_t$ with $X_{t+1}$ given $\mathcal{F}_n$ is zero:
\begin{align*}
\operatorname{Cov}(\varepsilon_t,X_{t+1}\mid \mathcal{F}_n)=0.
\end{align*}
Now subtract the smoothing mean formula from the definition of $\varepsilon_t$. Since
\begin{align*}
a_{t|n}=a_{t|t}+J_t(a_{t+1|n}-a_{t+1|t}),
\end{align*}
we have
\begin{align*}
X_t-a_{t|n}
&= X_t-a_{t|t}-J_t(a_{t+1|n}-a_{t+1|t}) \\
&= \varepsilon_t+J_t(X_{t+1}-a_{t+1|t})-J_t(a_{t+1|n}-a_{t+1|t}) \\
&= \varepsilon_t+J_t(X_{t+1}-a_{t+1|n}).
\end{align*}
Taking conditional covariance given $\mathcal{F}_n$ and using the vanishing cross-covariance gives
\begin{align*}
P_{t|n}
&= \operatorname{Cov}(X_t\mid \mathcal{F}_n) \\
&= \operatorname{Cov}(\varepsilon_t+J_t(X_{t+1}-a_{t+1|n})\mid \mathcal{F}_n) \\
&= \operatorname{Cov}(\varepsilon_t\mid \mathcal{F}_n)
+J_t\operatorname{Cov}(X_{t+1}\mid \mathcal{F}_n)J_t^\top.
\end{align*}
By definition,
\begin{align*}
P_{t+1|n}:=\operatorname{Cov}(X_{t+1}\mid \mathcal{F}_n),
\end{align*}
so
\begin{align*}
P_{t|n}
&= P_{t|t}-J_tP_{t+1|t}J_t^\top+J_tP_{t+1|n}J_t^\top \\
&= P_{t|t}+J_t(P_{t+1|n}-P_{t+1|t})J_t^\top.
\end{align*}
[/guided]
[/step]
[step:Run the recursion backward from the terminal filtering distribution]
At the terminal time $n$, the smoothing distribution equals the filtering distribution because no observations occur after time $n$:
\begin{align*}
a_{n|n}=\mathbb{E}[X_n\mid Y_{1:n}],
\qquad
P_{n|n}=\operatorname{Cov}(X_n\mid Y_{1:n}).
\end{align*}
Applying the two identities above successively for $t=n-1,n-2,\dots,1$ gives
\begin{align*}
a_{t|n} &= a_{t|t}+J_t(a_{t+1|n}-a_{t+1|t}), \\
P_{t|n} &= P_{t|t}+J_t(P_{t+1|n}-P_{t+1|t})J_t^\top,
\end{align*}
for every $t \in \{1,\dots,n-1\}$. This is the Rauch--Tung--Striebel smoothing recursion.
[/step]