pandas 选取数据时, iloc 和 loc 的用法不一样: iloc 根据整数位置(从 0 开始, 切片不包含终点)选取, loc 根据行索引的标签选取(切片包含终点)
>>> import pandas as pd
>>> import os
>>> os.chdir("D:\\")
>>> d = pd.read_csv("GWAS_water.qassoc", delimiter= "\s+")
>>> d.loc[1:3]
CHR SNP BP NMISS BETA SE R2 T P
1 1 . 447 44 0.1800 0.1783 0.02369 1.009 0.3185
2 1 . 449 44 0.2785 0.2473 0.02931 1.126 0.2665
3 1 . 452 44 0.1800 0.1783 0.02369 1.009 0.3185
>>> d.loc[0:3]
CHR SNP BP NMISS BETA SE R2 T P
0 1 . 410 44 0.2157 0.1772 0.03406 1.217 0.2304
1 1 . 447 44 0.1800 0.1783 0.02369 1.009 0.3185
2 1 . 449 44 0.2785 0.2473 0.02931 1.126 0.2665
3 1 . 452 44 0.1800 0.1783 0.02369 1.009 0.3185
>>> d.iloc[0:3]
CHR SNP BP NMISS BETA SE R2 T P
0 1 . 410 44 0.2157 0.1772 0.03406 1.217 0.2304
1 1 . 447 44 0.1800 0.1783 0.02369 1.009 0.3185
2 1 . 449 44 0.2785 0.2473 0.02931 1.126 0.2665
>>> d.iloc[1:3,2]
1 447
2 449
Name: BP, dtype: int64
>>> d.iloc[0:3,2]
0 410
1 447
2 449
Name: BP, dtype: int64
>>> d.head()
CHR SNP BP NMISS BETA SE R2 T P
0 1 . 410 44 0.2157 0.1772 0.03406 1.2170 0.2304
1 1 . 447 44 0.1800 0.1783 0.02369 1.0090 0.3185
2 1 . 449 44 0.2785 0.2473 0.02931 1.1260 0.2665
3 1 . 452 44 0.1800 0.1783 0.02369 1.0090 0.3185
4 1 . 462 44 0.2548 0.2744 0.02012 0.9286 0.3584
>>> d.tail(3)
CHR SNP BP NMISS BETA SE R2 T P
418704 12 . 19345588 44 -0.2207 0.2558 0.01743 -0.8631 0.393
418705 12 . 19345598 44 -0.2207 0.2558 0.01743 -0.8631 0.393
418706 12 . 19345611 44 -0.2207 0.2558 0.01743 -0.8631 0.393
>>> d.describe()
CHR BP NMISS BETA SE \
count 418707.000000 4.187070e+05 418707.0 4.186820e+05 418682.00000
mean 5.805738 1.442822e+07 44.0 -4.271777e-03 0.21433
std 3.392930 8.933882e+06 0.0 2.330019e-01 0.05190
min 1.000000 4.100000e+02 44.0 -1.610000e+00 0.10130
25% 3.000000 7.345860e+06 44.0 -1.638000e-01 0.17320
50% 5.000000 1.371612e+07 44.0 -1.826000e-16 0.20670
75% 9.000000 2.051322e+07 44.0 1.391000e-01 0.25010
max 12.000000 4.238896e+07 44.0 1.467000e+00 0.67580
R2 T P
count 418682.000000 4.186820e+05 4.186820e+05
mean 0.026268 -1.910774e-02 4.772397e-01
std 0.035903 1.095115e+00 2.944290e-01
min 0.000000 -5.582000e+00 2.034000e-08
25% 0.002969 -7.955000e-01 2.179000e-01
50% 0.012930 -8.468000e-16 4.624000e-01
75% 0.035910 6.712000e-01 7.254000e-01
max 0.531200 6.898000e+00 1.000000e+00
>>> d.sort_values(by="P").iloc[0:15]
CHR SNP BP NMISS BETA SE R2 T P
42870 1 . 32316680 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
29301 1 . 22184568 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
29302 1 . 22184590 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
29306 1 . 22184654 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
29305 1 . 22184628 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
29304 1 . 22184624 44 1.1870 0.1721 0.5312 6.898 2.034000e-08
112212 3 . 14365699 44 1.4670 0.2255 0.5018 6.504 7.490000e-08
29254 1 . 22167448 44 1.0780 0.1723 0.4822 6.254 1.713000e-07
69291 2 . 9480651 44 1.1140 0.1829 0.4690 6.091 2.939000e-07
29299 1 . 22180991 44 0.8527 0.1458 0.4488 5.848 6.574000e-07
101391 3 . 6959715 44 0.6782 0.1166 0.4462 5.817 7.285000e-07
29333 1 . 22198267 44 0.9252 0.1616 0.4383 5.724 9.888000e-07
195513 5 . 20178388 44 1.0350 0.1817 0.4359 5.697 1.082000e-06
29295 1 . 22180901 44 0.7469 0.1320 0.4324 5.657 1.236000e-06
29300 1 . 22181119 44 0.7469 0.1320 0.4324 5.657 1.236000e-06
>>> sort_D = d.sort_values(by="P").iloc[0:5]
>>> m_D = d.dropna() #remove NA
>>> sort_C = d.sort_values(["P","CHR", "BP"])
>>> sort_C.to_csv(file_name, sep='\t', encoding='utf-8')
>>> d.sort_values(by="C", ascending=True)
>>> sort_D.to_csv("result.txt", sep= " ")
>>> sort_D.to_csv("result_no_index.txt", sep= " ", index=False)
>>>
参考:
for m, i in enumerate(list(range(1,10))):
    for n, j in enumerate(list(range(m+1,10))):
        print i * j
http://stackoverflow.com/questions/25943208/using-pandas-read-csv-on-an-open-file-twice
https://github.com/lijin-THU/notes-python
本文出自 “R和Python应用” 博客,请务必保留此出处http://matrix6ro.blog.51cto.com/1746429/1891793