Python:如何使用二进制值编码DNA序列?

时间:2021-11-15 18:44:51

I would like to convert a file that contains a few DNA sequences into binary values, using the following encoding:

我想将包含少量DNA序列的文件转换为二进制值,如下所示:

A=1000
C=0100
G=0010
T=0001

FileA.txt

FileA.txt

CCGAT
GCTTA

Desired output

期望的输出

01000100001010000001
00100100000100011000

I have tried using this code to solve my problem, but the .bin output file does not contain my desired output. Can anyone help me?

我已经尝试使用此代码来解决我的问题,但bin输出文件似乎无法输出我想要的答案。谁能帮我?

Code

import sys

# Require exactly one command-line argument: the path of the nucleotide file.
if len(sys.argv) != 2 :
  sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
  sys.exit()

# assumes the file only contains dna and newlines
# Every line is stripped, upper-cased and appended to one string, so the line
# breaks of the input are not carried over into `sequence`.
sequence = ''
for line in open(sys.argv[1]) :
  sequence += line.strip().upper()

# Each letter is replaced by ONE character whose code point is 8, 4, 2 or 1.
# chr(0b1000) is the single character with code 8, not the four-character
# text '1000', so `sequence` now holds control characters, not digits.
sequence = sequence.replace('A', chr(0b1000))
sequence = sequence.replace('C', chr(0b0100))
sequence = sequence.replace('G', chr(0b0010))
sequence = sequence.replace('T', chr(0b0001))

# The output path is the input path with '.bin' appended, opened in binary mode.
outfile = open(sys.argv[1] + '.bin', 'wb')

# Writes the UTF-8 encoding of `sequence` (one byte per nucleotide for these
# code points). The file object is never explicitly closed.
outfile.write(bytearray(sequence, encoding = 'utf-8'))

2 个解决方案

#1


1  

Do you want ASCII output or binary? The code below will give you what you show in your post (though on a single line; the code needs to be modified to keep the newlines).

你想要ascii输出还是二进制?下面将为您提供您在帖子中显示的内容(尽管在一行中。代码需要修改以保留换行符)。

import sys

# Text code for each nucleotide: four '0'/'1' characters, exactly one of them '1'.
CODES = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}

if len(sys.argv) != 2:
    sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
    # Exit with a non-zero status so a calling shell can detect the usage error.
    sys.exit(1)

# assumes the file only contains dna and newlines
# Line endings are stripped and the lines are joined, so the whole input ends
# up as ONE line of output. The `with` block closes the input file.
with open(sys.argv[1]) as infile:
    sequence = ''.join(line.strip().upper() for line in infile)

# Characters without an entry in CODES are passed through unchanged.
sequence = ''.join(CODES.get(base, base) for base in sequence)

# The result is plain text made of '0' and '1', so the file is opened in text
# mode: writing a str to a file opened with 'wb' raises TypeError on Python 3.
with open(sys.argv[1] + '.bin', 'w') as outfile:
    outfile.write(sequence)

EDIT This creates a binary file where each nucleotide is a byte and the newlines are preserved in binary format.

编辑这会创建一个二进制文件,其中每个核苷酸都是一个字节,换行符以二进制格式保存。

import sys

if len(sys.argv) != 2:
    sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
    # Exit with a non-zero status so a calling shell can detect the usage error.
    sys.exit(1)

# assumes the file only contains dna and newlines
# One output byte per input symbol. The newline is stored as 0b1010 (decimal
# 10), so line breaks survive in the binary file.
BYTE_CODES = {'A': 0b1000, 'C': 0b0100, 'G': 0b0010, 'T': 0b0001, '\n': 0b1010}

# bytearray() with no arguments works on Python 2 and 3;
# bytearray(b'', encoding=...) raises TypeError on Python 3.
encoded = bytearray()
with open(sys.argv[1]) as infile:
    # Iterate line by line instead of read(1); the `with` block closes the file.
    for line in infile:
        # Upper-case so lower-case nucleotides are accepted as well.
        for symbol in line.upper():
            # Any other character raises KeyError naming the offending symbol.
            encoded.append(BYTE_CODES[symbol])

with open(sys.argv[1] + '.bin', 'wb') as outfile:
    outfile.write(encoded)

#Converts the binary file back to text and prints the resulting sequence.
# Byte value -> text; 0b1010 (decimal 10) is the stored line break.
DECODE = {
    0b1000: '1000',
    0b0100: '0100',
    0b0010: '0010',
    0b0001: '0001',
    0b1010: '\n',
}

# Read the file the encoder above wrote (input path + '.bin') rather than a
# hard-coded name, so the check works for any input file.
with open(sys.argv[1] + '.bin', 'rb') as testBin:
    # Iterating a bytearray yields integers on both Python 2 and Python 3.
    raw = bytearray(testBin.read())

# Byte values without an entry in DECODE are passed through as a character.
sequence = ''.join(DECODE.get(value, chr(value)) for value in raw)

# To keep the decoded text, write `sequence` to a file opened in text mode.
print(sequence)

#Shows the data of the binary file. For the example FileA.txt, byte 6 is the
#newline character 0b1010.
# Read the file the encoder above wrote (input path + '.bin').
with open(sys.argv[1] + '.bin', 'rb') as testBin:
    # Iterating a bytearray yields integers on both Python 2 and Python 3.
    contents = bytearray(testBin.read())

# Bytes are numbered from 1, matching the numbering used in the note above.
for i, value in enumerate(contents, 1):
    print('byte: {0} is {1:04b} and has decimal representation: {1}'.format(i, value))

#2


3  

import re

# Text code for each nucleotide letter.
d = {
    'A': '1000',
    'C': '0100',
    'G': '0010',
    'T': '0001',
}

patterns = ['CCGAT', 'GCTTA']

# Encode every pattern letter by letter through the table and print it.
for p in patterns:
    p = ''.join(d[letter] for letter in p)
    print(p)

#1


1  

Do you want ASCII output or binary? The code below will give you what you show in your post (though on a single line; the code needs to be modified to keep the newlines).

你想要ascii输出还是二进制?下面将为您提供您在帖子中显示的内容(尽管在一行中。代码需要修改以保留换行符)。

import sys

# Text code for each nucleotide: four '0'/'1' characters, exactly one of them '1'.
CODES = {'A': '1000', 'C': '0100', 'G': '0010', 'T': '0001'}

if len(sys.argv) != 2:
    sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
    # Exit with a non-zero status so a calling shell can detect the usage error.
    sys.exit(1)

# assumes the file only contains dna and newlines
# Line endings are stripped and the lines are joined, so the whole input ends
# up as ONE line of output. The `with` block closes the input file.
with open(sys.argv[1]) as infile:
    sequence = ''.join(line.strip().upper() for line in infile)

# Characters without an entry in CODES are passed through unchanged.
sequence = ''.join(CODES.get(base, base) for base in sequence)

# The result is plain text made of '0' and '1', so the file is opened in text
# mode: writing a str to a file opened with 'wb' raises TypeError on Python 3.
with open(sys.argv[1] + '.bin', 'w') as outfile:
    outfile.write(sequence)

EDIT This creates a binary file where each nucleotide is a byte and the newlines are preserved in binary format.

编辑这会创建一个二进制文件,其中每个核苷酸都是一个字节,换行符以二进制格式保存。

import sys

if len(sys.argv) != 2:
    sys.stderr.write('Usage: {} <nucleotide file>\n'.format(sys.argv[0]))
    # Exit with a non-zero status so a calling shell can detect the usage error.
    sys.exit(1)

# assumes the file only contains dna and newlines
# One output byte per input symbol. The newline is stored as 0b1010 (decimal
# 10), so line breaks survive in the binary file.
BYTE_CODES = {'A': 0b1000, 'C': 0b0100, 'G': 0b0010, 'T': 0b0001, '\n': 0b1010}

# bytearray() with no arguments works on Python 2 and 3;
# bytearray(b'', encoding=...) raises TypeError on Python 3.
encoded = bytearray()
with open(sys.argv[1]) as infile:
    # Iterate line by line instead of read(1); the `with` block closes the file.
    for line in infile:
        # Upper-case so lower-case nucleotides are accepted as well.
        for symbol in line.upper():
            # Any other character raises KeyError naming the offending symbol.
            encoded.append(BYTE_CODES[symbol])

with open(sys.argv[1] + '.bin', 'wb') as outfile:
    outfile.write(encoded)

#Converts the binary file back to text and prints the resulting sequence.
# Byte value -> text; 0b1010 (decimal 10) is the stored line break.
DECODE = {
    0b1000: '1000',
    0b0100: '0100',
    0b0010: '0010',
    0b0001: '0001',
    0b1010: '\n',
}

# Read the file the encoder above wrote (input path + '.bin') rather than a
# hard-coded name, so the check works for any input file.
with open(sys.argv[1] + '.bin', 'rb') as testBin:
    # Iterating a bytearray yields integers on both Python 2 and Python 3.
    raw = bytearray(testBin.read())

# Byte values without an entry in DECODE are passed through as a character.
sequence = ''.join(DECODE.get(value, chr(value)) for value in raw)

# To keep the decoded text, write `sequence` to a file opened in text mode.
print(sequence)

#Shows the data of the binary file. For the example FileA.txt, byte 6 is the
#newline character 0b1010.
# Read the file the encoder above wrote (input path + '.bin').
with open(sys.argv[1] + '.bin', 'rb') as testBin:
    # Iterating a bytearray yields integers on both Python 2 and Python 3.
    contents = bytearray(testBin.read())

# Bytes are numbered from 1, matching the numbering used in the note above.
for i, value in enumerate(contents, 1):
    print('byte: {0} is {1:04b} and has decimal representation: {1}'.format(i, value))

#2


3  

import re

# Text code for each nucleotide letter.
d = {
    'A': '1000',
    'C': '0100',
    'G': '0010',
    'T': '0001',
}

patterns = ['CCGAT', 'GCTTA']

# Encode every pattern letter by letter through the table and print it.
for p in patterns:
    p = ''.join(d[letter] for letter in p)
    print(p)