Computing Student's t-test p-values in R, Python, and C++

Date: 2022-03-10 20:04:36

To do data analysis, you arguably need to be comfortable with at least three kinds of languages:

1. A high-level language for plotting and advanced math (one of R, SAS, Matlab)

2. A scripting language for text processing and format conversion (perl/python/ruby/php, plus bash and awk)

3. A lower-level language for fast computation and complex algorithms (C/C++, Java)


Of course, this is not absolute. For example, I think today's Python, with the right extension packages, can handle all three of these roles.


On my MacBook Pro I wrote a couple of versions each in R, Python, and C++ to compute the Student's t-test p-value.

First, the R version, using apply to compute a p-value for each row of the data frame:

Program 1: R - apply

setwd("~/Documents/【】Computer-science/R/李程02_r_matrix")
mm_data <- read.delim("GSE6477_series_matrix_expr.txt.xls", row.names=1)
mm_info <- read.delim("GSE6477_sample _info.xls", row.names=1)

mm.group <- mm_info$Type == "New"     # new MM patients
normal.group <- mm_info$Type == "Normal donor"  # normal samples

mm_data.sub <- mm_data[1:10, ]  # test new code using a subset
apply(mm_data.sub, 1, t.test) # run t-test for each row (1 for row)
# but it's one sample t-test

one.t.test <- function(vec) {
  t.test(vec[normal.group], vec[mm.group])$p.value
}

system.time(  # time the computation
  p_vector1 <- apply(mm_data, 1, one.t.test)
)

If instead of a single apply call we loop over the rows one at a time:

Program 2: R - by line (for loop)

setwd("~/Documents/【】Computer-science/R/李程02_r_matrix")
mm_data <- read.delim("GSE6477_series_matrix_expr.txt.xls", row.names=1)
mm_info <- read.delim("GSE6477_sample _info.xls", row.names=1)

mm.group <- mm_info$Type == "New"     # new MM patients
normal.group <- mm_info$Type == "Normal donor"  # normal samples
mm.vec <- as.numeric(mm_data[1, mm.group])
normal.vec <- as.numeric(mm_data[1, normal.group])
t_test_result <- t.test(normal.vec, mm.vec)

mm_data.sub <- mm_data[1:10, ]  # test new code using a subset
apply(mm_data.sub, 1, t.test) # run t-test for each row (1 for row)
# but it's one sample t-test

one.t.test <- function(vec) {
  t.test(vec[normal.group], vec[mm.group])$p.value
}

p_vector2 <- NULL
system.time(  # time the computation
  for (i in seq_len(nrow(mm_data))) {
    p_vector2[[i]] <- apply(mm_data[i, ], 1, one.t.test)
  }
)

write.table(p_vector2,"test2.result.xls",sep="\t",quote=FALSE)

With Python, one option is to process the file line by line. Another is to collect each row's two groups of values into 2-D numpy arrays, roughly of the form

( [1,2,3] , [5,6,7,8]

     .......

     [1,3,4], [5,6,9,10] )

and then compute all the t-test p-values in a single vectorized call on the 2-D arrays, or alternatively compute the t-test row by row.
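As a minimal sketch of the vectorized idea (using made-up toy arrays, not the GSE6477 matrix): scipy.stats.ttest_ind tests each column of a 2-D input independently, so transposing the (genes x samples) matrices yields one Welch p-value per gene in a single call.

import numpy as np
from scipy import stats

# toy data: 3 genes (rows), 4 "new" samples and 5 "normal" samples (made-up numbers)
new    = np.random.randn(3, 4) + 1.0
normal = np.random.randn(3, 5)

# ttest_ind works along axis 0 by default, i.e. one test per column,
# so transposing gives one Welch (unequal-variance) p-value per gene
t_stat, p_val = stats.ttest_ind(new.T, normal.T, equal_var=False)
print(p_val)   # 3 p-values, one per gene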

The 2-D numpy array version:

Program 3: Python - numpy 2-D array

from __future__ import division
import numpy as np
from scipy import stats
import sys

def show_help():
   print >>sys.stderr,      "\n\tpython ",sys.argv[0],"GSE6477_sample_info.xls GSE6477_series_matrix_expr.txt.xls >../test3.result.xls"

def load_samp_type(sam_info,samp_type):
   f_saminfo = open(sam_info,"r")
   f_saminfo.readline()
   for line in f_saminfo:
      line = line.strip('\n')
      f  = line.split('\t')
      samp_type[f[0]] = f[2]
   f_saminfo.close()

def main():
   try:
      sam_info = sys.argv[1]
      sam_exp  = sys.argv[2]

   except IndexError:
      show_help()
      sys.exit(1)

   samp_type = {}
   load_samp_type(sam_info,samp_type)

   f_sam_exp = open(sam_exp,"r")
   in_h = f_sam_exp.readline()
   in_h = in_h.strip('\n')
   in_h = in_h.strip('\r')
   f_h  = in_h.split('\t')
   samp = f_h[1:]
   sam_norm = []
   sam_new  = []
#   print samp
   for i,sam in enumerate(samp):
      if   samp_type[sam] == 'New':
         sam_new.append(i+1)
      elif samp_type[sam] == 'Normal donor':
         sam_norm.append(i+1)

   l_samp_new  = []
   l_samp_norm = []
   l_gene      = []
   for line in f_sam_exp:
      line = line.strip('\n')
      line = line.strip('\r')
      f    = line.split('\t')
      l_gene.append(f[0])
      ll_new  = []
      ll_norm = []
      for i1 in sam_new:
         ll_new.append(float(f[i1]))
      for i2 in sam_norm:
         ll_norm.append(float(f[i2]))
      l_samp_new.append(ll_new)
      l_samp_norm.append(ll_norm)
      np_ll_new  = np.array(ll_new)
      np_ll_norm = np.array(ll_norm)
#      t,pval = stats.ttest_ind(np.array(ll_new),np.array(ll_norm))
#      print t,pval,line

   f_sam_exp.close()

   np_samp_new  = np.array(l_samp_new)
   np_samp_norm = np.array(l_samp_norm)

   l_pval = stats.ttest_ind(np_samp_new.T,np_samp_norm.T,equal_var=False)[1]


   for i in range(0,len(l_gene)):
      print l_gene[i],l_pval[i]

if __name__ == '__main__':
   main()


The line-by-line version:
Program 4: Python - by line

from __future__ import division
import numpy as np
from scipy import stats
import sys

def show_help():
   print >>sys.stderr,      "\n\tpython ",sys.argv[0],"GSE6477_sample_info.xls GSE6477_series_matrix_expr.txt.xls >../test3.result.xls"

def load_samp_type(sam_info,samp_type):
   f_saminfo = open(sam_info,"r")
   f_saminfo.readline()
   for line in f_saminfo:
      line = line.strip('\n')
      f  = line.split('\t')
      samp_type[f[0]] = f[2]
   f_saminfo.close()

def main():
   try:
      sam_info = sys.argv[1]
      sam_exp  = sys.argv[2]

   except IndexError:
      show_help()
      sys.exit(1)

   samp_type = {}
   load_samp_type(sam_info,samp_type)

   f_sam_exp = open(sam_exp,"r")
   in_h = f_sam_exp.readline()
   in_h = in_h.strip('\n')
   in_h = in_h.strip('\r')
   f_h  = in_h.split('\t')
   samp = f_h[1:]
   sam_norm = []
   sam_new  = []
#   print samp
   for i,sam in enumerate(samp):
      if   samp_type[sam] == 'New':
         sam_new.append(i+1)
      elif samp_type[sam] == 'Normal donor':
         sam_norm.append(i+1)

   for line in f_sam_exp:
      line = line.strip('\n')
      line = line.strip('\r')
      f    = line.split('\t')
      ll_new  = []
      ll_norm = []
      for i1 in sam_new:
         ll_new.append(float(f[i1]))
      for i2 in sam_norm:
         ll_norm.append(float(f[i2]))
      np_ll_new  = np.array(ll_new)
      np_ll_norm = np.array(ll_norm)
#      print f[0],np.mean(np_ll_new),np.std(np_ll_new),np.mean(np_ll_norm),np.std(np_ll_norm)
      t,pval = stats.ttest_ind(np.array(ll_new),np.array(ll_norm),equal_var=False)
      print f[0],pval

   f_sam_exp.close()

if __name__ == '__main__':
   main()

Computing with C++ and the Boost library:

Program 5: C++ - Boost

Compile with g++:

g++ -O -o study-licheng4 main.cpp

#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <functional>
#include <numeric>
#include <map>
#include <set>
#include <cmath>
#include <inttypes.h>
#include <unistd.h>   // getopt(), optind
#include <boost/unordered_map.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/math/distributions/students_t.hpp>

using namespace std;

double two_samples_t_test_unequal_sd(
        double Sm1,
        double Sd1,
        unsigned Sn1,
        double Sm2,
        double Sd2,
        unsigned Sn2)
{
   //
   // Sm1 = Sample Mean 1.
   // Sd1 = Sample Standard Deviation 1.
   // Sn1 = Sample Size 1.
   // Sm2 = Sample Mean 2.
   // Sd2 = Sample Standard Deviation 2.
   // Sn2 = Sample Size 2.
   //
   // A Student's t-test applied to two sets of data (Welch's unequal-variance form).
   // We are testing the null hypothesis that the two samples have the same mean
   // and that any difference is due to chance.

   using namespace boost::math;

   // Degrees of freedom:
   double v = Sd1 * Sd1 / Sn1 + Sd2 * Sd2 / Sn2;
   v *= v;
   double t1 = Sd1 * Sd1 / Sn1;
   t1 *= t1;
   t1 /=  (Sn1 - 1);
   double t2 = Sd2 * Sd2 / Sn2;
   t2 *= t2;
   t2 /= (Sn2 - 1);
   v /= (t1 + t2);
//   cout << setw(55) << left << "Degrees of Freedom" << "=  " << v << "\n";
   // t-statistic:

	double t_stat = (Sm1 - Sm2) / sqrt(Sd1 * Sd1 / Sn1 + Sd2 * Sd2 / Sn2);

   //
   // Define our distribution, and get the probability:
   //
   students_t dist(v);
   double q = cdf(complement(dist, fabs(t_stat)));
	return 2*q;
}

void usage()
{
   cout << "./study-licheng4 <samp_info> <samp_matrix>" << endl;
   cout << "./study-licheng4 GSE6477_sample_info.xls GSE6477_series_matrix_expr.txt.xls" << endl;
   cout << "  -h        get help information" << endl;

   exit (0);
} // void usage

void load_samp_type(
	string f_samp_info,
	boost::unordered_map< string,string > & M_samp_type)
{
	ifstream infile;
	infile.open(f_samp_info.c_str());
	if ( ! infile )
	{
		cerr << "fail to open input file " << f_samp_info << endl;
      exit(0);
	}
	string lineStr;
	getline(infile,lineStr,'\n');
	while (getline(infile,lineStr,'\n'))
	{
		if (lineStr[0] == ' ' || lineStr[0] == '\n'){
		   continue;
		}
		vector<string> LineVec;
		boost::split(LineVec,lineStr,boost::is_any_of("\t\r\n"),boost::token_compress_on);
		M_samp_type[LineVec[0]] = LineVec[2];
//		cout << LineVec[0] << "\t" << LineVec[2] << "\t" << M_samp_type[LineVec[0]] << endl;
	}
} // void load_samp_type


int main(int argc,char *argv[])
{
	int c;
	while ( (c=getopt(argc,argv,"h")) !=-1 )
	{
		switch(c){
			case 'h' : usage();break;
			default : usage();
		}
	}
	if (argc <2) usage();
	string f_samp_info   = argv[optind++];
	string f_samp_matrix = argv[optind++];
	boost::unordered_map< string,string > M_samp_type;

	load_samp_type(f_samp_info,M_samp_type);

	ifstream infile;
	infile.open(f_samp_matrix.c_str());
	if (! infile){
		cerr << "fail to open input file " << f_samp_matrix << endl;
		exit(0);
	}

	string lineStr_h;
	getline(infile,lineStr_h,'\n');
	vector<string> LineVec_h;
	boost::split(LineVec_h,lineStr_h,boost::is_any_of("\t\r\n"),boost::token_compress_on);
	vector<string> V_samp;

	for ( int i=1; i < LineVec_h.size()-1; i++ ){
		V_samp.push_back(LineVec_h[i]);
	}

	vector<int> V_sam_new;    // column indices of "New" samples
	vector<int> V_sam_norm;   // column indices of "Normal donor" samples

	for ( int i=0; i < V_samp.size(); i++ ){
		if ( M_samp_type[V_samp[i]] ==  "New" ){
			V_sam_new.push_back(i+1);
		}
		else{
			if ( M_samp_type[V_samp[i]] == "Normal donor" ){
				V_sam_norm.push_back(i+1);
			}
		}
	}

	string lineStr;
   while  (getline(infile,lineStr,'\n'))
	{
		if (lineStr[0] == ' ' || lineStr[0] == '\n'){
		   continue;
		}
		vector<string> LineVec;
		boost::split(LineVec,lineStr,boost::is_any_of("\t\r\n"),boost::token_compress_on);
		string gene = LineVec[0];

		vector<double> V_new;
		vector<double> V_norm;

		for (int i=0; i<V_sam_new.size();  i++){
			int i1 = V_sam_new[i];
			V_new.push_back( boost::lexical_cast<double>(LineVec[i1]) );
		}
		for (int i=0; i<V_sam_norm.size(); i++){
			int i2 = V_sam_norm[i];
			V_norm.push_back( boost::lexical_cast<double>(LineVec[i2]) );
		}

		double sum_new =  accumulate(V_new.begin(), V_new.end(), 0.0);
		double mean_new = sum_new / V_new.size();
		vector<double> diff1(V_new.size());
		transform( V_new.begin(),  V_new.end(), diff1.begin(),
		           [mean_new](double x){ return x - mean_new; } );  // bind2nd was removed in C++17
		double sq_sum_new = inner_product(diff1.begin(), diff1.end(), diff1.begin(), 0.0);
		double stdev_new =  sqrt(sq_sum_new / (V_new.size() - 1) );
		unsigned cnt_new = boost::lexical_cast<unsigned>(V_new.size());

		double sum_norm =  accumulate(V_norm.begin(), V_norm.end(), 0.0);
		double mean_norm = sum_norm / V_norm.size();

		vector<double> diff2(V_norm.size());
		transform( V_norm.begin(), V_norm.end(), diff2.begin(),
		           [mean_norm](double x){ return x - mean_norm; } );
		double sq_sum_norm = inner_product(diff2.begin(), diff2.end(), diff2.begin(), 0.0);
		double stdev_norm =  sqrt(sq_sum_norm / (V_norm.size() - 1) );
		unsigned cnt_norm = boost::lexical_cast<unsigned>(V_norm.size());

		cout << "\n" << gene << "\t" << mean_new << "\t" << stdev_new << "\t" << cnt_new << "\t" << mean_norm << "\t" << stdev_norm << "\t" << cnt_norm << endl;

		double p_value = two_samples_t_test_unequal_sd( mean_new, stdev_new, cnt_new, mean_norm, stdev_norm, cnt_norm );
		cout << gene << "\t" << p_value << endl;
	}
	infile.close();

}// int main
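For reference, two_samples_t_test_unequal_sd implements Welch's unequal-variance t-test. With group means $\bar{x}_i$, standard deviations $s_i$ and sizes $n_i$ (Sm, Sd, Sn in the code):

$$ t = \frac{\bar{x}_1-\bar{x}_2}{\sqrt{s_1^2/n_1 + s_2^2/n_2}}, \qquad \nu = \frac{\left(s_1^2/n_1 + s_2^2/n_2\right)^2}{\frac{(s_1^2/n_1)^2}{n_1-1}+\frac{(s_2^2/n_2)^2}{n_2-1}}, \qquad p = 2\,P\left(T_\nu \ge |t|\right), $$

where $\nu$ is the Welch-Satterthwaite degrees of freedom and $T_\nu$ is a Student's t variable with $\nu$ degrees of freedom; the two-sided p-value is the 2*q returned by the function.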

Finally, you can use Rcpp to write C++ extensions for R and speed up R's computation. R integrates very well with C++, which goes a long way toward making up for its slow raw compute speed.

Program 6: Rcpp

setwd("~/Documents/【】Computer-science/R/李程02_r_matrix")

Rcpp::sourceCpp('~/Documents/【】Computer-science/R/李程02_r_matrix/test/rcpp_hello_world.cpp')

mm_data <- read.delim("GSE6477_series_matrix_expr.txt.xls", row.names=1)
mm_info <- read.delim("GSE6477_sample _info.xls", row.names=1)

mm.group <- mm_info$Type == "New"     # new MM patients
normal.group <- mm_info$Type == "Normal donor"  # normal samples

one.t.test <- function(vec) {
  p_value(as.numeric(vec[normal.group]), as.numeric(vec[mm.group]))
}


p_vector3<-NULL
system.time(  # time the computation
  p_vector3<-apply(mm_data, 1, one.t.test)
)

write.table(p_vector3,"test5.result.xls",sep="\t",quote=FALSE)

The rcpp_hello_world.cpp sourced by this script is:

#include <Rcpp.h>
#include <boost/math/distributions/students_t.hpp>
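// Note: this assumes the Boost headers are on the compiler's include path.
// If they are not, the BH package can supply them: install.packages("BH")
// and add the attribute // [[Rcpp::depends(BH)]] above.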

using namespace Rcpp;

// [[Rcpp::export]]
double p_value(NumericVector V_new, NumericVector V_norm){

  using namespace boost::math;

  int Sn1 = V_new.size();
  int Sn2 = V_norm.size();
  double Sm1 = mean(V_new);
  double Sm2 = mean(V_norm);
  double Sd1 = sd(V_new);
  double Sd2 = sd(V_norm);

  double v = Sd1 * Sd1 / Sn1 + Sd2 * Sd2 / Sn2;
  v *= v;
  double t1 = Sd1 * Sd1 / Sn1;
  t1 *= t1;
  t1 /=  (Sn1 - 1);
  double t2 = Sd2 * Sd2 / Sn2;
  t2 *= t2;
  t2 /= (Sn2 - 1);
  v /= (t1 + t2);
  double t_stat = (Sm1 - Sm2) / sqrt(Sd1 * Sd1 / Sn1 + Sd2 * Sd2 / Sn2);
  students_t dist(v);
  double q = cdf(complement(dist, fabs(t_stat)));

  return 2*q;
}


Run times of the six programs above on my MacBook Pro:

Program 1 (R, apply):
real 0m10.780s
user 0m10.475s
sys 0m0.170s


Program 2 (R, for loop):
real 1m16.729s
user 1m16.222s
sys 0m0.402s


Program 3 (Python, numpy 2-D array):
real 0m1.737s
user 0m1.564s
sys 0m0.154s


Program 4 (Python, by line):
real 0m6.881s
user 0m6.818s
sys 0m0.059s


Program 5 (C++, Boost):
real 0m1.067s
user 0m0.949s
sys 0m0.094s


Program 6 (Rcpp, including sourceCpp compilation):
real 0m12.077s
user 0m10.345s
sys 0m0.567s


So in terms of speed: C++ > Python > R.

Note also that even without any IPython magic or Cython, Python with numpy/scipy used sensibly is already very close to C++: the gap is under 2x (1.74 s vs 1.07 s), far better than the roughly 10x of R with apply. Meanwhile R without apply, using a plain for loop, is painfully slow, around 70 times the C++ run time (76.7 s vs 1.07 s).


Also note that the last program (Rcpp) took about 12 s, i.e. slower than R without Rcpp. But looking at the R timing output more carefully, most of that time is actually spent in Rcpp::sourceCpp compiling the C++ code. Excluding compilation and looking only at the system.time of the p-value computation in the three R programs:

Program 1 (R, apply):
   user  system elapsed
  5.596   0.045   5.658


Program 2 (R, for loop):
   user  system elapsed
 71.373   0.277  71.667


Program 6 (Rcpp, computation only):
   user  system elapsed
  0.632   0.064   0.696

So the Rcpp version computes the p-values roughly 10 times faster than R's apply + t.test, and roughly 100 times faster than the R for loop. Ignoring compilation and I/O it approaches C++ speed, while the code is considerably simpler: no elaborate text parsing or hash maps, and no need to worry about overflow or underflow when computing means and variances by hand. For heavy computation in R, I strongly recommend writing the hot spots with Rcpp.