数据集通常需要去掉文件中的重复行,以下是操作代码:
# Concatenate two CSV files row-wise and save the result as a single CSV.
import pandas as pd  # needed here: this snippet runs before the file's later pandas import

# NOTE(review): both input paths are identical and look like truncated
# directory paths -- point them at the two actual .csv files before running.
df1 = pd.read_csv(r'/Ce/NO.1/', header=None)  # first input file
df2 = pd.read_csv(r'/Ce/NO.1/', header=None)  # second input file
file1 = [df1, df2]
# Stack the frames vertically (axis=0 is the default). A plain list has no
# to_csv(), so the frames must be combined into one DataFrame first.
outfile1 = pd.concat(file1)
outfile1.to_csv("/data/proteinall" + ".csv", index=0, header=None, sep=',')
# Remove duplicate lines, keeping only the first occurrence of each line.
# NOTE(review): the input and output paths below are the same directory-like
# string and look truncated -- set them to the real input file and a
# *different* output file, otherwise opening the output in 'w' mode would
# truncate the input before it is read.
listLines = []  # unique lines, in first-seen order
seenLines = set()  # same content as listLines, for O(1) membership tests
# The context manager closes both files even if reading or writing fails.
with open('/data/', 'r') as inFile, open('/data/', 'w') as outFile:
    for line in inFile:
        if line in seenLines:
            continue
        seenLines.add(line)
        listLines.append(line)
        outFile.write(line)  # written to the final de-duplicated .csv file
以下是读取 .txt 文件、去掉重复行并保存为 .csv 的代码:
####加载.txt,去掉重复序列,保存为.csv
import numpy as np
import pandas as pd
import copy
def read_traingingData(file_name):
    """Read a text file and return its lines as a list of strings.

    Args:
        file_name: Path of the text file to read, one sample per line.

    Returns:
        A list with one entry per line of the file, in file order, with the
        trailing newline removed. Blank lines are kept as empty strings and
        duplicates are not removed here.
    """
    seq = []
    with open(file_name, 'r') as fp:
        for line in fp:
            # Keep only the text before the line terminator.
            seq.append(line.split('\n')[0])
    return seq
# Load the sequences from the .txt file, drop duplicate rows, save as CSV.
file_1 = '/Ce/Ce.AC_N1.txt'
protein_A = read_traingingData(file_1)
# Wrap the list in a single-column DataFrame: a plain list has neither
# drop_duplicates() nor to_csv().
df_protein = pd.DataFrame(protein_A)
seq_protein = df_protein.drop_duplicates()  # keeps the first occurrence of each row
# NOTE(review): the output path ends in '/' and looks truncated -- replace it
# with the full path of the target .csv file before running.
seq_protein.to_csv('/home/aita/4444/LX/Ziqi/', index=0, header=None, encoding='gbk')