$$
L(\mu_{1},\mu_{2},\Sigma,\phi)=\sum\limits_{i=1}^{N}\left[\underbrace{\log N(\mu_{1},\Sigma)^{y_{i}}}_{(1)}+\underbrace{\log N(\mu_{2},\Sigma)^{1-y_{i}}}_{(2)}+\underbrace{\log \phi^{y_{i}}(1-\phi)^{1-y_{i}}}_{(3)}\right]
$$
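As a reference point for the derivations below, here is a minimal numerical sketch of this log-likelihood as the sum of terms $(1)$, $(2)$, $(3)$. The function and variable names are illustrative, and it assumes NumPy and SciPy are available (the original post does not mention either):

```python
import numpy as np
from scipy.stats import multivariate_normal

# Minimal sketch of L(mu1, mu2, Sigma, phi) as the sum of terms (1), (2), (3).
# All names here are illustrative, not from the original post.
def gda_log_likelihood(X, y, mu1, mu2, Sigma, phi):
    log_n1 = multivariate_normal.logpdf(X, mean=mu1, cov=Sigma)  # log N(mu1, Sigma) per sample
    log_n2 = multivariate_normal.logpdf(X, mean=mu2, cov=Sigma)  # log N(mu2, Sigma) per sample
    term1 = np.sum(y * log_n1)                                   # term (1)
    term2 = np.sum((1 - y) * log_n2)                             # term (2)
    term3 = np.sum(y * np.log(phi) + (1 - y) * np.log(1 - phi))  # term (3)
    return term1 + term2 + term3
```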
To find $\phi$: clearly only term $(3)$ depends on $\phi$.
$$
\begin{aligned}
(3)&=\sum\limits_{i=1}^{N}\log \phi^{y_{i}}(1-\phi)^{1-y_{i}}\\
&=\sum\limits_{i=1}^{N}[y_{i} \log \phi+(1-y_{i})\log(1-\phi)]\\
\frac{\partial (3)}{\partial \phi}&=\sum\limits_{i=1}^{N}\left[y_{i}\cdot \frac{1}{\phi}-\left(1-y_{i}\right) \frac{1}{1-\phi}\right]=0\\
0&=\sum\limits_{i=1}^{N}[y_{i}\cdot (1-\phi)-(1-y_{i})\phi]\\
0&=\sum\limits_{i=1}^{N}(y_{i}-y_{i}\phi-\phi+y_{i}\phi)\\
0&=\sum\limits_{i=1}^{N}(y_{i}-\phi)\\
0&=\sum\limits_{i=1}^{N}y_{i}-N \phi\\
\hat{\phi}&= \frac{\sum\limits_{i=1}^{N}y_{i}}{N}
\end{aligned}
$$
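A quick sanity check of $\hat{\phi}$ (a sketch with synthetic Bernoulli labels; the data and names are assumptions, not from the original post):

```python
import numpy as np

rng = np.random.default_rng(0)
y = rng.binomial(1, 0.3, size=1000)  # hypothetical labels drawn from Bernoulli(0.3)
phi_hat = y.sum() / len(y)           # phi_hat = (sum of y_i) / N, i.e. the fraction of positives
print(phi_hat)                       # expected to be close to 0.3
```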
To find $\mu_{1}$: clearly only term $(1)$ depends on $\mu_{1}$. The case of $\mu_{2}$ is analogous; simply replace $y_{i}$ with $1-y_{i}$.
$$
\begin{aligned}
(1)&=\sum\limits_{i=1}^{N}\log N(\mu_{1},\Sigma)^{y_{i}}\\
&=\sum\limits_{i=1}^{N}y_{i}\log \frac{1}{(2\pi)^{\frac{p}{2}}|\Sigma|^{\frac{1}{2}}}\text{exp}\left[ - \frac{1}{2}(x_{i}-\mu_{1})^{T}\Sigma^{-1}(x_{i}-\mu_{1})\right]\\
\mu_{1}&=\mathop{argmax\space}\limits_{\mu_{1}}(1)\\
&=\mathop{argmax\space}\limits_{\mu_{1}}\sum\limits_{i=1}^{N}y_{i}\left[ - \frac{1}{2}(x_{i}-\mu_{1})^{T}\Sigma^{-1}(x_{i}-\mu_{1})\right]\\
&=\mathop{argmax\space}\limits_{\mu_{1}}- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(x_{i}^{T}\Sigma^{-1}-\mu_{1}^{T}\Sigma^{-1})(x_{i}-\mu_{1})\\
&=\mathop{argmax\space}\limits_{\mu_{1}}- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(\underbrace{x_{i}^{T}\Sigma^{-1}x_{i}}_{\in \mathbb{R}}-\underbrace{x_{i}^{T}\Sigma^{-1}\mu_{1}}_{1 \times 1}-\underbrace{\mu_{1}^{T}\Sigma^{-1}x_{i}}_{1 \times 1}+\mu_{1}^{T}\Sigma^{-1}\mu_{1})\\
&=\mathop{argmax\space}\limits_{\mu_{1}}\underbrace{- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(x_{i}^{T}\Sigma^{-1}x_{i}-2\mu_{1}^{T}\Sigma^{-1}x_{i}+\mu_{1}^{T}\Sigma^{-1}\mu_{1})}_{\Delta }\\
\frac{\partial \Delta }{\partial \mu_{1}}&=- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(-2\Sigma^{-1}x_{i}+2\Sigma^{-1}\mu_{1})=0\\
0&=\sum\limits_{i=1}^{N}y_{i}(\Sigma^{-1}\mu_{1}-\Sigma^{-1}x_{i})\\
0&=\sum\limits_{i=1}^{N}y_{i}(\mu_{1}-x_{i})\\
\sum\limits_{i=1}^{N}y_{i}\mu_{1}&=\sum\limits_{i=1}^{N}y_{i}x_{i}\\
\hat{\mu_{1}}&=\frac{\sum\limits_{i=1}^{N}y_{i}x_{i}}{\sum\limits_{i=1}^{N}y_{i}}
\end{aligned}
$$
Here we define
$$
\begin{aligned}
C_{1}&=\left\{x_{i}|y_{i}=1,i=1,2,\cdots,N\right\},\quad|C_{1}|=N_{1}\\
C_{2}&=\left\{x_{i}|y_{i}=0,i=1,2,\cdots,N\right\},\quad|C_{2}|=N_{2}\\
N&=N_{1}+N_{2}
\end{aligned}
$$
Therefore
$$
\hat{\mu_{1}}=\frac{\sum\limits_{i=1}^{N}y_{i}x_{i}}{N_{1}}
$$
Replacing $y_{i}$ with $1-y_{i}$ gives $\hat{\mu_{2}}$:
$$
\hat{\mu_{2}}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{\sum\limits_{i=1}^{N}(1-y_{i})}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{N-N_{1}}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{N_{2}}
$$
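Both mean estimates are just per-class sample means, which is easy to verify numerically. The following sketch uses synthetic two-class Gaussian data; the data, class centers, and names are illustrative assumptions, not from the original post:

```python
import numpy as np

# Synthetic two-class Gaussian data (illustrative assumption).
rng = np.random.default_rng(0)
N, p = 500, 2
y = rng.binomial(1, 0.4, size=N)
X = np.where(y[:, None] == 1,
             rng.normal(loc=2.0, size=(N, p)),    # class y = 1 centered at (2, 2)
             rng.normal(loc=-1.0, size=(N, p)))   # class y = 0 centered at (-1, -1)

mu1_hat = (y[:, None] * X).sum(axis=0) / y.sum()              # sum(y_i * x_i) / N_1
mu2_hat = ((1 - y)[:, None] * X).sum(axis=0) / (1 - y).sum()  # sum((1 - y_i) * x_i) / N_2
print(mu1_hat, mu2_hat)  # expected to be close to [2, 2] and [-1, -1]
```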
To find $\Sigma$: clearly only terms $(1)$ and $(2)$ depend on $\Sigma$.
$$
\begin{aligned}
(1)+(2)&=\sum\limits_{i=1}^{N}y_{i}\log N(\mu_{1},\Sigma)+\sum\limits_{i=1}^{N}(1-y_{i})\log N(\mu_{2},\Sigma)\\
&=\sum\limits_{x_{i}\in C_{1}}\log N(\mu_{1},\Sigma)+\sum\limits_{x_{i}\in C_{2}}\log N(\mu_{2},\Sigma)\\
\sum\limits_{i=1}^{N}\log N(\mu,\Sigma)&=\sum\limits_{i=1}^{N}\log\left\{ \frac{1}{(2\pi)^{\frac{p}{2}}|\Sigma|^{\frac{1}{2}}}\text{exp}\left[- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\right]\right\}\\
&=\sum\limits_{i=1}^{N}\left[\log \frac{1}{\left(2\pi\right)^{\frac{p}{2}}}+ \log |\Sigma|^{-\frac{1}{2}}- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}- \mu)\right]\\
&=\sum\limits_{i=1}^{N}\left[C - \frac{1}{2}\log|\Sigma|- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\right]\\
&=C- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}\underbrace{\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)}_{\in \mathbb{R}}\\
\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)&=\sum\limits_{i=1}^{N}\text{tr }\left[(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\right]\\
&=\sum\limits_{i=1}^{N}\text{tr }\left[(x_{i}-\mu)(x_{i}-\mu)^{T}\Sigma^{-1}\right]\\
&=\text{tr }\left[\underbrace{\sum\limits_{i=1}^{N}(x_{i}-\mu)(x_{i}-\mu)^{T}}_{N S,\ \text{where }S\text{ is the sample covariance of the }x_{i}}\Sigma^{-1}\right]\\
&\quad \text{with } S= \frac{1}{N}\sum\limits_{i=1}^{N}(x_{i}-\mu)(x_{i}-\mu)^{T}\\
&=N \cdot \text{tr }(S \Sigma^{-1})\\
&\quad \text{Substituting back into } \sum\limits_{i=1}^{N}\log N(\mu,\Sigma):\\
\sum\limits_{i=1}^{N}\log N(\mu,\Sigma)&=C- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\\
&=- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}N \cdot \text{tr }(S \cdot \Sigma^{-1})+C\\
&\quad \text{Substituting back into }(1)+(2),\text{ with }S_{1},S_{2}\text{ the sample covariances over }C_{1},C_{2}:\\
(1)+(2)&=- \frac{1}{2}N_{1}\log|\Sigma|- \frac{1}{2}N_{1} \cdot \text{tr }(S_{1}\Sigma^{-1})- \frac{1}{2}N_{2}\log|\Sigma|- \frac{1}{2}N_{2} \cdot \text{tr }(S_{2}\Sigma^{-1})+C\\
&=- \frac{1}{2}[N \log|\Sigma|+ N_{1}\text{tr }(S_{1}\Sigma^{-1})+N_{2}\text{tr }(S_{2}\Sigma^{-1})]+C\\
\frac{\partial\,[(1)+(2)]}{\partial \Sigma}&=- \frac{1}{2}\left(N \cdot \frac{1}{|\Sigma|}\cdot|\Sigma|\Sigma^{-1}-N_{1}\Sigma^{-1}S_{1}\Sigma^{-1}-N_{2}\Sigma^{-1}S_{2}\Sigma^{-1}\right)=0\\
N \Sigma-N_{1}S_{1}-N_{2}S_{2}&=0 \quad (\text{multiplying by }\Sigma\text{ on both sides})\\
\hat{\Sigma}&=\frac{1}{N}(N_{1}S_{1}+N_{2}S_{2})
\end{aligned}
$$
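The pooled estimate $\hat{\Sigma}=\frac{1}{N}(N_{1}S_{1}+N_{2}S_{2})$ can be computed directly from data. A minimal sketch, where `gda_sigma_hat` and its arguments are illustrative names (e.g. the `X`, `y` generated in the earlier sketch):

```python
import numpy as np

def gda_sigma_hat(X, y):
    # Per-class scatter: S_k = (1/N_k) * sum over class k of (x - mu_k)(x - mu_k)^T
    X1, X2 = X[y == 1], X[y == 0]
    N1, N2 = len(X1), len(X2)
    S1 = np.cov(X1, rowvar=False, bias=True)  # bias=True divides by N_1, matching S_1 above
    S2 = np.cov(X2, rowvar=False, bias=True)
    return (N1 * S1 + N2 * S2) / (N1 + N2)    # Sigma_hat = (N_1 S_1 + N_2 S_2) / N
```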
Properties of the trace:
$$\begin{aligned} \text{tr }(AB)&=\text{tr }(BA)\\ \text{tr }(ABC)&=\text{tr }(CAB)=\text{tr }(BCA)\end{aligned}$$
Matrix derivatives:
$$\begin{aligned} \frac{\partial \text{tr }(AB)}{\partial A}&=B^{T}\\ \frac{\partial |A|}{\partial A}&=|A|\cdot (A^{-1})^{T}\end{aligned}$$
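These identities are easy to check numerically on random matrices. A sketch, where the single-entry finite-difference check of $\partial\,\text{tr}(AB)/\partial A=B^{T}$ is illustrative:

```python
import numpy as np

rng = np.random.default_rng(0)
A, B, C = (rng.normal(size=(3, 3)) for _ in range(3))

# Cyclic property of the trace: tr(AB) = tr(BA), tr(ABC) = tr(CAB) = tr(BCA)
print(np.isclose(np.trace(A @ B), np.trace(B @ A)))
print(np.isclose(np.trace(A @ B @ C), np.trace(C @ A @ B)),
      np.isclose(np.trace(A @ B @ C), np.trace(B @ C @ A)))

# Finite-difference check that d tr(AB) / dA[i, j] = B[j, i], i.e. the gradient is B^T
eps, i, j = 1e-6, 0, 2
A_pert = A.copy()
A_pert[i, j] += eps
grad_ij = (np.trace(A_pert @ B) - np.trace(A @ B)) / eps
print(np.isclose(grad_ij, B[j, i]))
```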
In the figure, circles denote positive samples and crosses denote negative samples; the line $p(y=1|x)=0.5$ is the decision boundary. Because the two classes share the same $\Sigma$, the two Gaussians have the same shape but different means $\mu$.
Author: 张文翔
Link: Andrew Ng Stanford机器学习公开课 总结(5) - 张文翔的博客 | BY ZhangWenxiang (demmon-tju.github.io)