如何修复R中oligonucleotideFrequency(寡核苷酸频率)函数的报错

时间:2022-11-07 19:07:48

I have a problem with the read.fasta function of package "seqinr". When I use it with a lapply, it doesn't create the desired vector.

我在使用“seqinr”包的read.fasta函数时遇到了问题。当我把它和lapply一起使用时,它不会创建所需的向量。

Also, when I use the function count on a vector built manually, the results are a table of zeros.

此外,当我对手动构建的向量使用count函数时,结果是一个全为零的表。

This is my code:

这是我的代码:

library("seqinr")
library(MASS)

#GETTING THE FILES AFTER FRAGMENTS OF 500
# NOTE(review): `pattern` is a regular expression, so every "." in it matches
# any single character, not only a literal dot.
files <- list.files(path="/Users/CamilaMV/Desktop/TESIS/",       pattern=".fna500mer..split", full.names=T, recursive=FALSE)

# Print the matched paths for inspection.
files

# IT IS ONLY TAKING THE FIRST FILE

#READING THE DIFFERENT FASTA FILES
# One result per file; seqonly = T asks read.fasta for the sequences only.
ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = T) })


# Copy every per-file result of ncrna into seqs, keeping the same order.
seqs<-list()
for(i in seq_along(ncrna))
{
  seqs[i]<-list(ncrna[[i]])
}

# Number of entries read from the first file.
len1<-length(seqs[[1]])

# Copy the entries of the first file only (seqs[[1]]) into frags1.
# NOTE(review): 1:len1 iterates over c(1, 0) when len1 is 0.
frags1<-list()
for(j in 1:len1)
{
  frags1[j]<-list(seqs[[1]][[j]])
}

# Print the fragments for inspection.
frags1

#COUNTING TRETRANUCLEOTIDES FOR EACH FRAGMENT
tetra_frag1<-list()

# seq_along(frags1)

#frags1[[1]]

for(l in seq_along(frags1))
{
  #tetra[i]<-list(count(ncra[[i]],4))
  # This is the call that raises the "unable to find an inherited method ...
  # for signature "character"" error quoted in the text below: frags1[[l]] is
  # passed unchanged, and is.character() reports TRUE for it.
  tetra_frag1[l]<-oligonucleotideFrequency(frags1[[l]],4)  
}

When I did it before, the count function worked but it doesn't work properly anymore.

我之前这样做的时候,count函数是可以工作的,但现在它不再正常工作了。

Then, I decided to use the oligonucleotideFrequency function, but it gives me the following error:

然后,我决定使用oligonucleotideFrequency函数,但它给出了以下错误:

Error in (function (classes, fdef, mtable) : unable to find an inherited method for function ‘oligonucleotideFrequency’ for signature ‘"character"’

(函数(classes,fdef,mtable)中的错误:无法为签名'“character”'找到函数'oligonucleotideFrequency'的继承方法

But when I used is.character(frags1[[1]]) as a test, the result is true.

但是当我使用is.character(frags1 [[1]])作为测试时,结果为真。

I want to get a matrix of oligonucleotide frequencies to perform a PCA.

我想得到一个具有寡核苷酸频率的矩阵来执行PCA。

I want a final table where the columns are the 256 combinations of tetranucleotides and the rows are the names of the fragments (e.g. frag1, frag2,...) like the following:

我想要一个最终表,其中列是四核苷酸的256种组合,行是片段的名称(例如frag1,frag2,......),如下所示:

aaaa aaac ... f1 3 5 f2 4 6 f3 5 7 ...

aaaa aaac ... f1 3 5 f2 4 6 f3 5 7 ...

I would appreciate any help.

非常感谢您的帮助。

1 个解决方案

#1


1  

I was able to resolve the first problem and the others. In the end, I have an R script with 4 functions that produces a list of RGB vectors.

我可以解决第一个问题和其他问题。最后,我有一个带有4个函数的R脚本,它们生成一个RGB向量列表。

 # GETTING LIBRARIES

library("seqinr")
library("ade4")
library("Biostrings")


## funcion 1

#' Read every fragment file in a directory and pool all fragments.
#'
#' Lists the files directly inside `PATH_FILES` whose names match the pattern
#' ".fna500mer", reads each one with `read.fasta(seqonly = TRUE)` and joins
#' the sequences of all files, in file order, into a single list. Nothing is
#' assumed about how many files match or how many fragments each one holds.
#'
#' @param PATH_FILES Directory that is searched (non-recursively).
#' @return An unnamed list with one element per sequence read; an empty list
#'   when no file matches.
Processing_fragments<-function(PATH_FILES){

  # NOTE(review): the pattern is a regular expression, so its leading "."
  # matches any character; kept unchanged so the same files are selected.
  files <- list.files(path=PATH_FILES, pattern=".fna500mer", full.names=TRUE, recursive=FALSE)

  # One collection of sequences per file.
  ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = TRUE) })

  # Flatten exactly one level (per-file collections -> individual fragments).
  # as.list() keeps the result a list even when nothing was read or when a
  # reader result is a plain character vector.
  allFragments <- as.list(unlist(ncrna, recursive = FALSE, use.names = FALSE))

  return(allFragments)

}


## funcion 2

#' Build the k-mer frequency matrix that is used as PCA input.
#'
#' @param allFragments List (or character vector) of DNA sequences, one
#'   sequence per fragment.
#' @param kmer Oligonucleotide width, e.g. 4 for tetranucleotides.
#' @return Numeric matrix with one row per fragment, named "frag1", "frag2",
#'   ... in input order, and one column per k-mer as labelled by
#'   `oligonucleotideFrequency()`. Values are relative frequencies.
Getting_frequency_account<-function(allFragments,kmer){

  # Put all fragments into one DNAStringSet so counting is a single call.
  fragment_set <- DNAStringSet(as.character(unlist(allFragments, use.names = FALSE)))

  # For a set, the result has one row per sequence and k-mer column names,
  # so the column count follows `kmer` instead of being fixed.
  # as.prob = TRUE turns counts into relative frequencies.
  matrix_PCA <- oligonucleotideFrequency(fragment_set, kmer, as.prob = TRUE)

  # sprintf() yields character(0) for zero fragments (paste would not).
  rownames(matrix_PCA) <- sprintf("frag%d", seq_len(length(fragment_set)))

  return(matrix_PCA)

}


# View(matrix_PCA)


## funcion 3

#' Project the fragments onto their first three principal components.
#'
#' @param matrix_PCA Numeric matrix with one row per fragment and one column
#'   per variable.
#' @return Matrix of PCA scores with the rows of `matrix_PCA` and the first
#'   three component columns of `prcomp(matrix_PCA)$x`.
Getting_first_three_components<-function(matrix_PCA){

  # PCA with prcomp() using its default settings.
  prcomp_All<-prcomp(matrix_PCA)

  # `x` holds the scores (the rotated observations), not the loadings.
  scores<-prcomp_All$x

  # Fail with a clear message instead of a subscript error further down.
  if (ncol(scores) < 3) {
    stop("PCA produced fewer than three components", call. = FALSE)
  }

  First_three_components<-scores[,c(1,2,3)]

  return(First_three_components)

}

#funcion 4

#' Rescale PCA scores to [0, 1] and express every fragment as an RGB colour.
#'
#' The minimum and maximum are taken over the whole input, so the three
#' channels share one linear scale: the smallest value maps to 0 and the
#' largest to 1. Columns 1, 2 and 3 become red, green and blue.
#'
#' @param First_three_components Numeric matrix (or data frame) with at least
#'   three columns and one row per fragment.
#' @param as_hex If `TRUE`, return "#RRGGBB" strings instead of RGB triples.
#' @return By default an unnamed list holding one numeric vector
#'   `c(red, green, blue)` per row. With `as_hex = TRUE`, a character vector
#'   of hex colour codes, one per row.
Generating_hex_color_codes<-function(First_three_components, as_hex = FALSE){

  scores <- as.matrix(First_three_components)

  if (ncol(scores) < 3) {
    stop("at least three columns are required", call. = FALSE)
  }

  # Nothing to colour: return an empty result of the requested kind.
  if (nrow(scores) == 0) {
    if (as_hex) {
      return(character(0))
    }
    return(list())
  }

  # Shared scale over all values of the input.
  limits <- range(scores)
  span <- limits[2] - limits[1]

  if (!is.finite(span) || span == 0) {
    stop("cannot rescale: values are constant or not finite", call. = FALSE)
  }

  # Direct min-max scaling keeps every value inside [0, 1], which rgb()
  # requires when maxColorValue is 1.
  scaled <- (scores[, 1:3, drop = FALSE] - limits[1]) / span

  if (as_hex) {
    return(rgb(scaled[, 1], scaled[, 2], scaled[, 3], maxColorValue = 1))
  }

  # One unnamed c(red, green, blue) vector per fragment.
  rgb_List_Each_Fragment <- lapply(seq_len(nrow(scaled)), function(i) unname(scaled[i, ]))

  return(rgb_List_Each_Fragment)

}

# Run the four pipeline stages in order; each stage consumes the result of
# the previous one.
allFragments <- Processing_fragments("/Users/CamilaMV/Desktop/TESIS")
matrix_PCA <- Getting_frequency_account(allFragments, 4)
First_three_components <- Getting_first_three_components(matrix_PCA)
Hex_color_list <- Generating_hex_color_codes(First_three_components)

#1


1  

I was able to resolve the first problem and the others. In the end, I have an R script with 4 functions that produces a list of RGB vectors.

我可以解决第一个问题和其他问题。最后,我有一个带有4个函数的R脚本,它们生成一个RGB向量列表。

 # GETTING LIBRARIES

library("seqinr")
library("ade4")
library("Biostrings")


## funcion 1

#' Read every fragment file in a directory and pool all fragments.
#'
#' Lists the files directly inside `PATH_FILES` whose names match the pattern
#' ".fna500mer", reads each one with `read.fasta(seqonly = TRUE)` and joins
#' the sequences of all files, in file order, into a single list. Nothing is
#' assumed about how many files match or how many fragments each one holds.
#'
#' @param PATH_FILES Directory that is searched (non-recursively).
#' @return An unnamed list with one element per sequence read; an empty list
#'   when no file matches.
Processing_fragments<-function(PATH_FILES){

  # NOTE(review): the pattern is a regular expression, so its leading "."
  # matches any character; kept unchanged so the same files are selected.
  files <- list.files(path=PATH_FILES, pattern=".fna500mer", full.names=TRUE, recursive=FALSE)

  # One collection of sequences per file.
  ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = TRUE) })

  # Flatten exactly one level (per-file collections -> individual fragments).
  # as.list() keeps the result a list even when nothing was read or when a
  # reader result is a plain character vector.
  allFragments <- as.list(unlist(ncrna, recursive = FALSE, use.names = FALSE))

  return(allFragments)

}


## funcion 2

#' Build the k-mer frequency matrix that is used as PCA input.
#'
#' @param allFragments List (or character vector) of DNA sequences, one
#'   sequence per fragment.
#' @param kmer Oligonucleotide width, e.g. 4 for tetranucleotides.
#' @return Numeric matrix with one row per fragment, named "frag1", "frag2",
#'   ... in input order, and one column per k-mer as labelled by
#'   `oligonucleotideFrequency()`. Values are relative frequencies.
Getting_frequency_account<-function(allFragments,kmer){

  # Put all fragments into one DNAStringSet so counting is a single call.
  fragment_set <- DNAStringSet(as.character(unlist(allFragments, use.names = FALSE)))

  # For a set, the result has one row per sequence and k-mer column names,
  # so the column count follows `kmer` instead of being fixed.
  # as.prob = TRUE turns counts into relative frequencies.
  matrix_PCA <- oligonucleotideFrequency(fragment_set, kmer, as.prob = TRUE)

  # sprintf() yields character(0) for zero fragments (paste would not).
  rownames(matrix_PCA) <- sprintf("frag%d", seq_len(length(fragment_set)))

  return(matrix_PCA)

}


# View(matrix_PCA)


## funcion 3

#' Project the fragments onto their first three principal components.
#'
#' @param matrix_PCA Numeric matrix with one row per fragment and one column
#'   per variable.
#' @return Matrix of PCA scores with the rows of `matrix_PCA` and the first
#'   three component columns of `prcomp(matrix_PCA)$x`.
Getting_first_three_components<-function(matrix_PCA){

  # PCA with prcomp() using its default settings.
  prcomp_All<-prcomp(matrix_PCA)

  # `x` holds the scores (the rotated observations), not the loadings.
  scores<-prcomp_All$x

  # Fail with a clear message instead of a subscript error further down.
  if (ncol(scores) < 3) {
    stop("PCA produced fewer than three components", call. = FALSE)
  }

  First_three_components<-scores[,c(1,2,3)]

  return(First_three_components)

}

#funcion 4

#' Rescale PCA scores to [0, 1] and express every fragment as an RGB colour.
#'
#' The minimum and maximum are taken over the whole input, so the three
#' channels share one linear scale: the smallest value maps to 0 and the
#' largest to 1. Columns 1, 2 and 3 become red, green and blue.
#'
#' @param First_three_components Numeric matrix (or data frame) with at least
#'   three columns and one row per fragment.
#' @param as_hex If `TRUE`, return "#RRGGBB" strings instead of RGB triples.
#' @return By default an unnamed list holding one numeric vector
#'   `c(red, green, blue)` per row. With `as_hex = TRUE`, a character vector
#'   of hex colour codes, one per row.
Generating_hex_color_codes<-function(First_three_components, as_hex = FALSE){

  scores <- as.matrix(First_three_components)

  if (ncol(scores) < 3) {
    stop("at least three columns are required", call. = FALSE)
  }

  # Nothing to colour: return an empty result of the requested kind.
  if (nrow(scores) == 0) {
    if (as_hex) {
      return(character(0))
    }
    return(list())
  }

  # Shared scale over all values of the input.
  limits <- range(scores)
  span <- limits[2] - limits[1]

  if (!is.finite(span) || span == 0) {
    stop("cannot rescale: values are constant or not finite", call. = FALSE)
  }

  # Direct min-max scaling keeps every value inside [0, 1], which rgb()
  # requires when maxColorValue is 1.
  scaled <- (scores[, 1:3, drop = FALSE] - limits[1]) / span

  if (as_hex) {
    return(rgb(scaled[, 1], scaled[, 2], scaled[, 3], maxColorValue = 1))
  }

  # One unnamed c(red, green, blue) vector per fragment.
  rgb_List_Each_Fragment <- lapply(seq_len(nrow(scaled)), function(i) unname(scaled[i, ]))

  return(rgb_List_Each_Fragment)

}

# Run the four pipeline stages in order; each stage consumes the result of
# the previous one.
allFragments <- Processing_fragments("/Users/CamilaMV/Desktop/TESIS")
matrix_PCA <- Getting_frequency_account(allFragments, 4)
First_three_components <- Getting_first_three_components(matrix_PCA)
Hex_color_list <- Generating_hex_color_codes(First_three_components)