lin_20240321_calculating_rG4score.R
################################################################################
################# Function used for the G4Hunter paper #########################
################################################################################
#### L. Lacroix, laurent.lacroix@inserm.fr , 20150928
################################################################################
###### G4translate change the DNA code into G4Hunter code.
###### Only G or C are taken into account. non G/C bases are translated in 0
###### It is OK if N or U are in the sequence
###### but might be a problem if other letters or numbers are present
###### lowercase ARE not welcome
G4translate <- function(x)
# x is a Rle of a sequence
{
xres=x
runValue(xres)[runValue(x)=='C' & runLength(x)>3] <- -4
runValue(xres)[runValue(x)=='C' & runLength(x)==3] <- -3
runValue(xres)[runValue(x)=='C' & runLength(x)==2] <- -2
runValue(xres)[runValue(x)=='C' & runLength(x)==1] <- -1
runValue(xres)[runValue(x)=='G' & runLength(x)>3] <- 4
runValue(xres)[runValue(x)=='G' & runLength(x)==3] <- 3
runValue(xres)[runValue(x)=='G' & runLength(x)==2] <- 2
runValue(xres)[runValue(x)=='G' & runLength(x)==1] <- 1
runValue(xres)[runValue(x)!='C' & runValue(x)!='G'] <- 0
Rle(as.numeric(xres))
}
################################################################################
################################################################################
###### G4Hscore return the G4Hscore of a sequence
###### The output is a number
G4Hscore <- function(y)
# y can be DNAString or a DNAStringSet or a simple char string.
{
require(S4Vectors)
y2 <- Rle(strsplit(as.character(y),NULL)[[1]])
y3 <- G4translate(y2)
mean(y3)
}
################################################################################
################################################################################
###### G4runmean return the G4Hscore in a window of 25 using runmean.
###### The output is a Rle of the runmeans
G4runmean <- function(y,k=25)
#y is a DNAString or a DNAStringSet, k is the window for the runmean
{
require(S4Vectors)
y2 <- Rle(strsplit(as.character(y),NULL)[[1]])
y3 <- G4translate(y2)
runmean(y3,k)
}
################################################################################
################################################################################
#### function to extract sequences with abs(G4Hscore)>=hl (threshold)
#### from a chromosome (i) of a genome (gen)
#### return a GRanges
#### use masked=5 for unmasked genome
#### need a genome to be in the memory in the genome variable
#### k is the window size for the runmean
#### i is the chromosome number
#### hl is the threshold
G4hunt <- function(i,k=25,hl=1.5,gen=genome,masked=5)
{
require(GenomicRanges)
chr <- gen[[i]]
if (masked==2) {chr=injectHardMask(chr)}
if (masked==3) {active(masks(chr))['RM']=T;chr=injectHardMask(chr)}
if (masked==0) {active(masks(chr))=F;chr=injectHardMask(chr)}
if (masked==4) {active(masks(chr))=T;chr=injectHardMask(chr)}
chr_G4hk <- G4runmean(chr,k)
tgenome <- G4translate(Rle(strsplit(as.character(chr),NULL)[[1]]))
if (class(gen)=="DNAStringSet")
{seqname <- names(gen)[i]
}else{
seqname <- seqnames(gen)[i]
}
j <- hl
chrCh <- Views(chr_G4hk, chr_G4hk<=(-j))
chrGh <- Views(chr_G4hk, chr_G4hk>=j)
IRC <- IRanges(start=start(chrCh),end=(end(chrCh)+k-1))
IRG <- IRanges(start=start(chrGh),end=(end(chrGh)+k-1))
nIRC <- reduce(IRC)
nIRG <- reduce(IRG)
# overlapping results on the same strand are fused
nchrCh <- Views(chr_G4hk,start=start(nIRC),end=(end(nIRC)-k+1))
nchrGh <- Views(chr_G4hk,start=start(nIRG),end=(end(nIRG)-k+1))
nscoreC <- signif(mean(Views(tgenome,nIRC)),3)
nscoreG <- signif(mean(Views(tgenome,nIRG)),3)
nstraC <- Rle(rep('-',length(nIRC)))
nstraG <- Rle(rep('+',length(nIRG)))
nhlC <- Rle(rep(j,length(nIRC)))
nhlG <- Rle(rep(j,length(nIRG)))
nkC <- Rle(rep(k,length(nIRC)))
nkG <- Rle(rep(k,length(nIRG)))
maskC <- Rle(rep(masked,length(nIRC)))
maskG <- Rle(rep(masked,length(nIRG)))
if (length(nIRC)==0)
{
nxC <- GRanges()
}else{
nxC <- GRanges(seqnames=Rle(seqname),
ranges=nIRC,
strand=nstraC,
score=nscoreC,hl=nhlC,k=nkC,mask=maskC)
}
if (length(nIRG)==0)
{
nxG <- GRanges()
}else{
nxG <- GRanges(seqnames=Rle(seqname),
ranges=nIRG,
strand=nstraG,
score=nscoreG,hl=nhlG,k=nkG,mask=maskG)
}
nx <- c(nxC,nxG)
return(nx)
}
################################################################################
################################################################################
##### like G4hunt but for several thresholds
##### warning, if the list of threshold is created with the seq function
##### rounding error can occur
##### return a GRangesList
G4huntlist <- function(i,k=25,hl=c(1,1.2,1.5,1.75,2),gen=genome,masked=5)
{
require(GenomicRanges)
nx <- list()
chr <- gen[[i]]
if (masked==2) {chr=injectHardMask(chr)}
if (masked==3) {active(masks(chr))['RM']=T;chr=injectHardMask(chr)}
if (masked==0) {active(masks(chr))=F;chr=injectHardMask(chr)}
if (masked==4) {active(masks(chr))=T;chr=injectHardMask(chr)}
chr_G4hk <- G4runmean(chr,k)
tgenome <- G4translate(Rle(strsplit(as.character(chr),NULL)[[1]]))
if (class(gen)=="DNAStringSet")
{seqname <- names(gen)[i]
}else{
seqname <- seqnames(gen)[i]
}
hl <- round(hl,2)
for (j in hl)
{
chrCh <- Views(chr_G4hk, chr_G4hk<=(-j))
chrGh <- Views(chr_G4hk, chr_G4hk>=j)
IRC <- IRanges(start=start(chrCh),end=(end(chrCh)+k-1))
IRG <- IRanges(start=start(chrGh),end=(end(chrGh)+k-1))
nIRC <- reduce(IRC)
nIRG <- reduce(IRG)
# overlapping results on the same strand are fused
nchrCh <- Views(chr_G4hk,start=start(nIRC),end=(end(nIRC)-k+1))
nchrGh <- Views(chr_G4hk,start=start(nIRG),end=(end(nIRG)-k+1))
nscoreC <- signif(mean(Views(tgenome,nIRC)),3)
nscoreG <- signif(mean(Views(tgenome,nIRG)),3)
nstraC <- Rle(rep('-',length(nIRC)))
nstraG <- Rle(rep('+',length(nIRG)))
nhlC <- Rle(rep(j,length(nIRC)))
nhlG <- Rle(rep(j,length(nIRG)))
nkC <- Rle(rep(k,length(nIRC)))
nkG <- Rle(rep(k,length(nIRG)))
maskC <- Rle(rep(masked,length(nIRC)))
maskG <- Rle(rep(masked,length(nIRG)))
if (length(nIRC)==0)
{
nxC <- GRanges()
}else{
nxC <- GRanges(seqnames=Rle(seqname),
ranges=nIRC,
strand=nstraC,
score=nscoreC,hl=nhlC,k=nkC,mask=maskC)
}
if (length(nIRG)==0)
{
nxG <- GRanges()
}else{
nxG <- GRanges(seqnames=Rle(seqname),
ranges=nIRG,
strand=nstraG,
score=nscoreG,hl=nhlG,k=nkG,mask=maskG)
}
nx[[which(hl==j)]]=c(nxC,nxG)
}
names(nx) <- as.character(hl)
nx <- GRangesList(nx)
return(nx)
}
################################################################################
################################################################################
##### function to refine G4hunt results
G4startrun <- function(y,chrom=chr,letter='C') #y is a START
{
if (y!=1)
{if (letter(chrom,y)==letter)
{
while (letter(chrom,y-1)==letter) {y <- y-1}
}
else
{
y <- y+1
while (letter(chrom,y)!=letter) {y <- y+1}
}
}
return(y)
}
G4endrun <- function(y,chrom=chr,letter='C') #y is a END
{
if (y!=length(chrom))
{if (letter(chrom,y)==letter)
{
while (letter(chrom,y+1)==letter) {y <- y+1}
}
else
{
y <- y-1
while (letter(chrom,y)!=letter) {y <- y-1}
}
}
return(y)
}
################################################################################
################################################################################
G4huntrefined <- function(Res_hl,gen=genome,i=25)
# take in input the result from the function G4hunt, a GRanges
# it also require a genome and chromosome number (i) to extract the seq
# and to build the GRanges
{
chr <- gen[[i]]
if (class(gen)=="DNAStringSet")
{seqname <- names(gen)[i]
}else{
seqname <- seqnames(gen)[i]
}
Reschr <- Res_hl[seqnames(Res_hl)==seqname]
tgenome <- G4translate(Rle(strsplit(as.character(chr),NULL)[[1]]))
### C treatment
ReschrC <- Reschr[which(score(Reschr)<0)]
IRC <- IRanges(start=start(ReschrC),end=end(ReschrC))
if (length(IRC)==0)
{
totC <- NULL
}else{
chr_transC <- Views(tgenome,IRC)
nnIRC <- IRC
start(nnIRC) <- sapply(start(IRC),G4startrun,letter='C',chrom=chr)
end(nnIRC) <- sapply(end(IRC),G4endrun,letter='C',chrom=chr)
nG4scoreC <- signif(mean(Views(tgenome,nnIRC)),3)
nnseqC <- as.character(Views(chr,nnIRC))
totC <- cbind.data.frame(start(nnIRC),end(nnIRC),nnseqC,nG4scoreC)
names(totC) <- c('newstart','newend','newseq','newG4h')
}
### G treatment
ReschrG <- Reschr[which(score(Reschr)>0)]
IRG=IRanges(start=start(ReschrG),end=end(ReschrG))
if (length(IRG)==0)
{
totG <- NULL
}else{
chr_transG <- Views(tgenome,IRG)
nnIRG <- IRG
start(nnIRG) <- sapply(start(IRG),G4startrun,letter='G',chrom=chr)
end(nnIRG) <- sapply(end(IRG),G4endrun,letter='G',chrom=chr)
nG4scoreG <- signif(mean(Views(tgenome,nnIRG)),3)
nnseqG <- as.character(Views(chr,nnIRG))
totG <- cbind.data.frame(start(nnIRG),end(nnIRG),nnseqG,nG4scoreG)
names(totG) <- c('newstart','newend','newseq','newG4h')
}
if (is.null(totC) & is.null(totG))
{
tot2 <- NULL
tot2GR <- NULL
}else{
tot <- rbind.data.frame(totC,totG)
tot2 <- tot[order(tot[,'newstart']),]
stra <- sign(tot2$newG4h