蛋白质PDB文件读取C++代码

时间:2021-06-09 16:49:17

在生物信息学中,PDB数据库是非常重要的蛋白质结构数据库。PDB数据文件的读取是进行蛋白质结构相关预测的重要步骤!下面给出了简单的PDB文件读取的C++代码,以供参考:


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sstream>
#include <iostream>
#include <fstream>
#include <vector>
using namespace std;

class Residue {
public:
vector<double*> atomxyzs;
vector<string> atominfos; // each element for example : "ATOM 1 N GLU 98 "
int caind;
int cbind;
int res_pos;
int orig_res_pos;

Residue(vector<double*> _atomxyzs, vector<string> _atominfos, int _caind, int _cbind, int _res_pos, int _orig_res_pos){
atomxyzs = _atomxyzs;
atominfos = _atominfos;
caind = _caind;
cbind = _cbind;
res_pos = _res_pos;
orig_res_pos = _orig_res_pos;
}
};

class Protein{
private:
string pdbpath; // in
vector<Residue> residues; // out

public:
Protein(string _pdbpath){
pdbpath = _pdbpath;

readPDB();
}

int size(){
return residues.size();
}

Residue getIthResidue(int i){
return residues[i];
}

int getIthResidueOriginalIndexInProt(int i){
return residues[i].orig_res_pos;
}

double* getIthResidueCAXYZ(int i){
return residues[i].atomxyzs[residues[i].caind];
}

double* getIthResidueCBXYZ(int i){
return residues[i].atomxyzs[residues[i].cbind];
}

vector<Residue> getResidues(){
return residues;
}

void save(vector<Residue> _residues, string savepath){
ofstream fout(savepath);
for (int i = 0; i < _residues.size(); i++){
Residue res = _residues[i];
for (int j = 0 ; j < res.atominfos.size(); j++){
char temp[1000];

string atominfo = res.atominfos[j]; // "ATOM 1 N GLU 98 "
double* xyz = res.atomxyzs[j]; //" 66.677"
char xstr[9];
char ystr[9];
char zstr[9];
sprintf(xstr, "%8.3f", xyz[0]);
sprintf(ystr, "%8.3f", xyz[1]);
sprintf(zstr, "%8.3f", xyz[2]);
string useless = " 1.00 0.00";
fout << atominfo << xstr << ystr << zstr << useless << endl;
}
}

fout.close();
}

/*char* formatDouble(double v, int ch_num){
char tmp[64];
sprintf(tmp, "%8.3f %8.3f %8.3f\n", x[i][0], x[i][1], x[i][2]);
}*/

private:
void readPDB(){
vector<int> atomindex;
vector<string> atominfos;
vector<double*> points;
int ca_pos_in_res = 0;
int cb_pos_in_res = 0;

int pre_index = -9999999;
ifstream fin(pdbpath);
string line;
getline(fin, line);
string lastline(line);
bool isFirstATOM = true;
bool isContainCA = false;
int tmp_pos = 0;
int res_pos = 1;
int atom_pos = 1;
while (fin.good()){
if (line.compare(0, 3, "TER")==0)
break;
if (line.compare(0, 4, "ATOM")==0){
char cstr[50];
double x = 0.0;
strcpy(cstr, (line.substr(30, 8)).c_str());
sscanf(cstr, "%lf", &x);

double y = 0.0;
strcpy(cstr, (line.substr(38, 8)).c_str());
sscanf(cstr, "%lf", &y);

double z = 0.0;
strcpy(cstr, (line.substr(46, 8)).c_str());
sscanf(cstr, "%lf", &z);

double* xyz = new double[3];
xyz[0] = x;
xyz[1] = y;
xyz[2] = z;

string atominfo = line.substr(0, 30);

int now_index = -999999;
strcpy(cstr, (line.substr(22, 4)).c_str());
sscanf(cstr, "%d", &now_index);

if (line.compare(13, 3, "CA ")==0){
isContainCA = true;
ca_pos_in_res = tmp_pos;

if (isFirstATOM){
pre_index = now_index;
}
}

if (line.compare(13, 3, "CB ")==0){
cb_pos_in_res = tmp_pos;
}

if (now_index != pre_index){
if (isContainCA){
if (pre_index != -9999999){
Residue tmpResidue(points, atominfos, ca_pos_in_res, cb_pos_in_res, res_pos, pre_index);
residues.push_back(tmpResidue);
res_pos++;
}
points.clear();
points.push_back(xyz);
atominfos.clear();
atominfos.push_back(atominfo);

tmp_pos = 0;
isContainCA = false;
if (line.compare(13, 3, "CA ")==0){
isContainCA = true;
}
}else{
if (isFirstATOM){
points.push_back(xyz);
atominfos.push_back(atominfo);
}
}
pre_index = now_index;
}else{
points.push_back(xyz);
atominfos.push_back(atominfo);
}

isFirstATOM = false;

tmp_pos++;
}

lastline = line;
getline(fin, line);
}

if (isContainCA || lastline.length() >= 16 || lastline.compare(13, 3, "CA ")==0){
Residue tmpResidue(points, atominfos, ca_pos_in_res, cb_pos_in_res, res_pos, pre_index);
residues.push_back(tmpResidue);
}

fin.close();
}
};