I would like to convert a file that contains a few DNA sequences into binary values, as follows:
我想将包含少量DNA序列的文件转换为二进制值,如下所示:
A=1000
C=0100
G=0010
T=0001
FileA.txt
FileA.txt
CCGAT
GCTTA
Desired output
期望的输出
01000100001010000001
00100100000100011000
I have tried using this code to solve my problem, but the .bin output file does not contain my desired output. Can anyone help me?
我已经尝试使用此代码来解决我的问题,但bin输出文件似乎无法输出我想要的答案。谁能帮我?
Code
代码
import sys

# Character written for each nucleotide.  chr() yields a single character
# with the given code point, so every base becomes ONE raw byte (0x08, 0x04,
# 0x02 or 0x01) in the output file -- not the four ASCII digits '1000',
# '0100', '0010' or '0001'.
NUCLEOTIDE_CODES = {
    'A': chr(0b1000),
    'C': chr(0b0100),
    'G': chr(0b0010),
    'T': chr(0b0001),
}


def encode_sequence(sequence):
    """Return *sequence* with A/C/G/T replaced by their one-character codes.

    Characters other than A, C, G and T are passed through unchanged.
    """
    return ''.join(NUCLEOTIDE_CODES.get(base, base) for base in sequence)


def main():
    """Encode the file named on the command line into ``<file>.bin``."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
        sys.exit(1)  # wrong usage is a failure, so report a non-zero status

    # assumes the file only contains dna and newlines
    with open(sys.argv[1]) as infile:
        # strip() removes the line breaks: all lines end up in one sequence.
        sequence = ''.join(line.strip().upper() for line in infile)

    # 'with' guarantees the output is flushed and closed.
    with open(sys.argv[1] + '.bin', 'wb') as outfile:
        outfile.write(bytearray(encode_sequence(sequence), encoding='utf-8'))


if __name__ == '__main__':
    main()
2 个解决方案
#1
1
Do you want ASCII output or binary? The code below will give you what you show in your post (though on a single line; the code needs to be modified to keep newlines).
你想要ascii输出还是二进制?下面将为您提供您在帖子中显示的内容(尽管在一行中。代码需要修改以保留换行符)。
import sys

# Four-digit text code emitted for each nucleotide.
NUCLEOTIDE_DIGITS = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}


def encode_as_digits(sequence):
    """Return *sequence* with each A/C/G/T replaced by its 4-digit code.

    Characters other than A, C, G and T are passed through unchanged.
    """
    return ''.join(NUCLEOTIDE_DIGITS.get(base, base) for base in sequence)


def main():
    """Write the digit-encoded nucleotides to ``<nucleotide file>.bin``."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
        sys.exit(1)  # wrong usage is a failure, so report a non-zero status

    # assumes the file only contains dna and newlines
    with open(sys.argv[1]) as infile:
        # strip() removes the line breaks, so the output is one single line.
        sequence = ''.join(line.strip().upper() for line in infile)

    # The result is ASCII text, so the output is opened in text mode:
    # writing a str to a file opened with 'wb' raises TypeError on Python 3.
    with open(sys.argv[1] + '.bin', 'w') as outfile:
        outfile.write(encode_as_digits(sequence))


if __name__ == '__main__':
    main()
EDIT: This creates a binary file where each nucleotide is a byte and the newlines are preserved in binary format.
编辑:这会创建一个二进制文件,其中每个核苷酸都是一个字节,换行符以二进制格式保存。
import sys

# Byte stored for each input character.  The newline keeps its own ASCII
# code (0b1010 == 10), so line breaks survive the conversion.
BYTE_FOR_CHAR = {
    'A': 0b1000,
    'C': 0b0100,
    'G': 0b0010,
    'T': 0b0001,
    '\n': 0b1010,
}

# Readable text emitted for each stored byte when decoding.
TEXT_FOR_BYTE = {
    0b1000: '1000',
    0b0100: '0100',
    0b0010: '0010',
    0b0001: '0001',
    0b1010: '\n',
}


def encode_nucleotides(text):
    """Return a bytearray holding one byte per character of *text*.

    Letters are upper-cased first.  Raises KeyError if *text* contains
    anything other than A, C, G, T or a newline.
    """
    return bytearray(BYTE_FOR_CHAR[char] for char in text.upper())


def decode_nucleotides(data):
    """Return the 4-digit text form of encoded *data*, newlines restored.

    Bytes without an entry in TEXT_FOR_BYTE are passed through as the
    character with that code.
    """
    return ''.join(TEXT_FOR_BYTE.get(byte, chr(byte)) for byte in bytearray(data))


def describe_bytes(data):
    """Return one report line per byte of *data*; byte numbers start at 1."""
    return [
        'byte: ' + str(index) + ' is ' + '{0:04b}'.format(byte)
        + ' and has decimal representation: ' + str(byte)
        for index, byte in enumerate(bytearray(data), 1)
    ]


def main():
    """Encode the file named on the command line, then verify the result."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
        sys.exit(1)  # wrong usage is a failure, so report a non-zero status

    bin_path = sys.argv[1] + '.bin'

    # assumes the file only contains dna and newlines
    with open(sys.argv[1]) as infile:
        encoded = encode_nucleotides(infile.read())
    with open(bin_path, 'wb') as outfile:
        outfile.write(encoded)

    # Converts the binary file to text and prints the result sequence.
    # The file that was just written is read back, rather than a
    # hard-coded file name.
    with open(bin_path, 'rb') as bin_file:
        stored = bin_file.read()
    print(decode_nucleotides(stored))

    # Shows the data of the binary file.  For the sample FileA.txt, byte 6
    # is the newline character 0b1010.
    for report_line in describe_bytes(stored):
        print(report_line)


if __name__ == '__main__':
    main()
#2
3
import re

d = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}
patterns = ['CCGAT', 'GCTTA']

# Matches any single nucleotide that has an entry in ``d``.
_NUCLEOTIDE = re.compile('|'.join(re.escape(key) for key in d))


def to_binary(sequence):
    """Return *sequence* with every nucleotide in ``d`` replaced by its code.

    The string is translated in a single ``re.sub`` pass instead of one
    substitution over the whole string per character.  Characters without
    an entry in ``d`` are left unchanged.
    """
    return _NUCLEOTIDE.sub(lambda match: d[match.group(0)], sequence)


for p in patterns:
    print(to_binary(p))
#1
1
Do you want ASCII output or binary? The code below will give you what you show in your post (though on a single line; the code needs to be modified to keep newlines).
你想要ascii输出还是二进制?下面将为您提供您在帖子中显示的内容(尽管在一行中。代码需要修改以保留换行符)。
import sys

# Four-digit text code emitted for each nucleotide.
NUCLEOTIDE_DIGITS = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}


def encode_as_digits(sequence):
    """Return *sequence* with each A/C/G/T replaced by its 4-digit code.

    Characters other than A, C, G and T are passed through unchanged.
    """
    return ''.join(NUCLEOTIDE_DIGITS.get(base, base) for base in sequence)


def main():
    """Write the digit-encoded nucleotides to ``<nucleotide file>.bin``."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
        sys.exit(1)  # wrong usage is a failure, so report a non-zero status

    # assumes the file only contains dna and newlines
    with open(sys.argv[1]) as infile:
        # strip() removes the line breaks, so the output is one single line.
        sequence = ''.join(line.strip().upper() for line in infile)

    # The result is ASCII text, so the output is opened in text mode:
    # writing a str to a file opened with 'wb' raises TypeError on Python 3.
    with open(sys.argv[1] + '.bin', 'w') as outfile:
        outfile.write(encode_as_digits(sequence))


if __name__ == '__main__':
    main()
EDIT: This creates a binary file where each nucleotide is a byte and the newlines are preserved in binary format.
编辑:这会创建一个二进制文件,其中每个核苷酸都是一个字节,换行符以二进制格式保存。
import sys

# Byte stored for each input character.  The newline keeps its own ASCII
# code (0b1010 == 10), so line breaks survive the conversion.
BYTE_FOR_CHAR = {
    'A': 0b1000,
    'C': 0b0100,
    'G': 0b0010,
    'T': 0b0001,
    '\n': 0b1010,
}

# Readable text emitted for each stored byte when decoding.
TEXT_FOR_BYTE = {
    0b1000: '1000',
    0b0100: '0100',
    0b0010: '0010',
    0b0001: '0001',
    0b1010: '\n',
}


def encode_nucleotides(text):
    """Return a bytearray holding one byte per character of *text*.

    Letters are upper-cased first.  Raises KeyError if *text* contains
    anything other than A, C, G, T or a newline.
    """
    return bytearray(BYTE_FOR_CHAR[char] for char in text.upper())


def decode_nucleotides(data):
    """Return the 4-digit text form of encoded *data*, newlines restored.

    Bytes without an entry in TEXT_FOR_BYTE are passed through as the
    character with that code.
    """
    return ''.join(TEXT_FOR_BYTE.get(byte, chr(byte)) for byte in bytearray(data))


def describe_bytes(data):
    """Return one report line per byte of *data*; byte numbers start at 1."""
    return [
        'byte: ' + str(index) + ' is ' + '{0:04b}'.format(byte)
        + ' and has decimal representation: ' + str(byte)
        for index, byte in enumerate(bytearray(data), 1)
    ]


def main():
    """Encode the file named on the command line, then verify the result."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
        sys.exit(1)  # wrong usage is a failure, so report a non-zero status

    bin_path = sys.argv[1] + '.bin'

    # assumes the file only contains dna and newlines
    with open(sys.argv[1]) as infile:
        encoded = encode_nucleotides(infile.read())
    with open(bin_path, 'wb') as outfile:
        outfile.write(encoded)

    # Converts the binary file to text and prints the result sequence.
    # The file that was just written is read back, rather than a
    # hard-coded file name.
    with open(bin_path, 'rb') as bin_file:
        stored = bin_file.read()
    print(decode_nucleotides(stored))

    # Shows the data of the binary file.  For the sample FileA.txt, byte 6
    # is the newline character 0b1010.
    for report_line in describe_bytes(stored):
        print(report_line)


if __name__ == '__main__':
    main()
#2
3
import re

d = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}
patterns = ['CCGAT', 'GCTTA']

# Matches any single nucleotide that has an entry in ``d``.
_NUCLEOTIDE = re.compile('|'.join(re.escape(key) for key in d))


def to_binary(sequence):
    """Return *sequence* with every nucleotide in ``d`` replaced by its code.

    The string is translated in a single ``re.sub`` pass instead of one
    substitution over the whole string per character.  Characters without
    an entry in ``d`` are left unchanged.
    """
    return _NUCLEOTIDE.sub(lambda match: d[match.group(0)], sequence)


for p in patterns:
    print(to_binary(p))