转录组数据的基因表达变化情况探索

一般来说可以用CV或者MAD来衡量某基因在某些样本的表达变化情况。

标准差与平均数的比值称为变异系数，记为C.V(Coefficient of Variance)。 变异系数又称“标准差率”，是衡量资料中各观测值变异程度的另一个统计量。当进行两个或多个资料变异程度的比较时，如果度量单位与平均数相同，可以直接利用标准差来比较。平均绝对误差（Mean Absolute Deviation），又叫平均绝对离差，它是是所有单个观测值与算术平均值的偏差的绝对值的平均。

用下面的代码可以看看，标准差，平均数，变异系数，平均绝对误差的关系，出图如下：

代码如下:

 1library(airway)
 2library(edgeR)
 3library(DESeq2)
 4
 5data(airway)
 6airway
 7exprSet=assay(airway) 
 8geneLists=rownames(exprSet)
 9keepGene=rowSums(cpm(exprSet)>0) >=2
10table(keepGene);dim(exprSet)
11dim(exprSet[keepGene,])
12exprSet=exprSet[keepGene,]
13rownames(exprSet)=geneLists[keepGene]
14
15boxplot(exprSet,las=2)
16# CPM normalized counts.
17exprSet=log2(cpm(exprSet)+1)
18boxplot(exprSet,las=2)
19
20mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
21sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE)
22mad_perl_gene <-   apply(exprSet, 1, mad, na.rm = TRUE)
23cv_per_gene <- data.frame(mean = mean_per_gene,
24                          sd = sd_per_gene,
25                          mad=mad_perl_gene,
26                          cv = sd_per_gene/mean_per_gene)
27rownames(cv_per_gene) <- rownames(exprSet)
28pairs(cv_per_gene)

很明显，这个CV可以衡量某基因的表达变化情况，但是没办法在基因与基因之间比较，因为不同基因的CV不同，大部分情况是因为它们的平均表达量不同而已。

根据表达量对CV值进行校正

 1# https://jdblischak.github.io/singleCellSeq/analysis/cv-adjusted-wo-19098-r2.html
 2library(zoo)
 3# Compute a data-wide coefficient of variation on CPM normalized counts.
 4cv <- apply(2^exprSet, 1, sd)/apply(2^exprSet, 1, mean)
 5# Order of genes by mean expression levels
 6order_gene <- order(apply(2^exprSet, 1, mean))
 7# Rolling medians of log10 squared CV by mean expression levels
 8roll_medians <- rollapply(log10(cv^2)[order_gene], width = 50, by = 25,
 9                          FUN = median, fill = list("extend", "extend", "NA") )
10## then change the NA values in the roll_medians
11table(is.na(roll_medians))
12ii_na <- which( is.na(roll_medians) )
13roll_medians[ii_na] <- median( log10(cv^2)[order_gene][ii_na] )
14names(roll_medians) <- rownames(exprSet)[order_gene]
15
16
17# re-order rolling medians according to the expression matrix 
18roll_medians <- roll_medians[  match(rownames(exprSet), names(roll_medians) ) ]
19stopifnot( all.equal(names(roll_medians), rownames(exprSet) ) )
20
21
22# adjusted coefficient of variation on log10 scale
23log10cv2_adj <-  log10( cv^2) - roll_medians
24plot(log10cv2_adj,mean_per_gene)
25#install.packages("basicTrendline")
26library(basicTrendline)
27trendline(log10cv2_adj,mean_per_gene,model="line2P")

出图如下：

可以看到这个校正后的cv已经是几乎不受基因表达量的影响了，所以可以比较不同基因的表达变化情况啦。

根据基因长度对CV进行校正

先去gencode数据库找到gtf文件，对每个基因计算外显子长度之和作为基因的长度，代码如下；

1## First, wecomputed gene lengths by taking the union of all exons within a gene based on the Ensembl annotation.
2
3#  cat ~/reference/gtf/gencode/gencode.v25.annotation.gtf|grep -v PAR_Y |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSGd+)/;print $1} }'|sort |uniq -c |grep -w 2
4# cat ~/reference/gtf/gencode/gencode.v25.annotation.gtf |grep -v PAR_Y |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSGd+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_t$c{$_}" foreach keys %c}' >>human_ENSG_length
5# grep ENSG00000237094  ~/reference/gtf/gencode/gencode.v25.annotation.gtf|grep -w exon |cut -f 4-5|sort -u |awk '{print $2-$1}'|paste -sd+ - | bc
6# grep ENSG00000237094 human_ENSG_length
7#  cat gencode.vM12.annotation.gtf |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSMUSGd+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_t$c{$_}" foreach keys %c}' >mouse_ENSG_length

得到的长度文件如下：

1               V1    V2
21 ENSG00000252040   131
32 ENSG00000251770    82
43 ENSG00000261028   856
54 ENSG00000186844   421
65 ENSG00000234241  1682
76 ENSG00000144815 15589

 1gen_l=read.table('human_ENSG_length',stringsAsFactors = F)
 2head(gen_l)
 3length_per_gene=gen_l[match(rownames(exprSet),gen_l[,1]),2]
 4mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
 5sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE)
 6mad_perl_gene <-   apply(exprSet, 1, mad, na.rm = TRUE)
 7cv_per_gene <- data.frame(mean = mean_per_gene,
 8                          sd = sd_per_gene,
 9                          mad=mad_perl_gene,
10                          len=log2(length_per_gene),
11                          cv = sd_per_gene/mean_per_gene)
12rownames(cv_per_gene) <- rownames(exprSet)
13cv_per_gene=na.omit(cv_per_gene)
14cor(cv_per_gene)
15pairs(cv_per_gene)
16
17fivenum(cv_per_gene$mean)
18## 假如去除低表达量基因
19cv_per_gene=cv_per_gene[cv_per_gene$mean < fivenum(cv_per_gene$mean)[2],]
20pairs(cv_per_gene)

出图如下：

可以看到基因长度的确是影响着CV值，而且并不独立于表达量，所以还是需要去除这个因素。

可以使用校正表达量的代码来校正长度：

 1library(zoo) 
 2table(rownames(exprSet) %in% gen_l[,1])
 3exprSet=exprSet[rownames(exprSet) %in% gen_l[,1],]
 4cv <- apply(2^exprSet, 1, sd)/apply(2^exprSet, 1, mean) 
 5## firstly for mean values of exprSet 
 6order_gene <- order(apply(2^exprSet, 1, mean)) 
 7roll_medians_mean <- rollapply(log10(cv^2)[order_gene], width = 50, by = 25,
 8                          FUN = median, fill = list("extend", "extend", "NA") )
 9## then change the NA values in the roll_medians_mean
10table(is.na(roll_medians_mean))
11ii_na <- which( is.na(roll_medians_mean) )
12roll_medians_mean[ii_na] <- median( log10(cv^2)[order_gene][ii_na] )
13names(roll_medians_mean) <- rownames(exprSet)[order_gene] 
14roll_medians_mean <- roll_medians_mean[  match(rownames(exprSet), names(roll_medians_mean) ) ]
15stopifnot( all.equal(names(roll_medians_mean), rownames(exprSet) ) ) 
16log10cv2_adj <-  log10( cv^2) - roll_medians_mean
17
18mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
19plot( log10( cv^2) ,mean_per_gene)
20plot(log10cv2_adj,mean_per_gene)
21length_per_gene=gen_l[match(rownames(exprSet),gen_l[,1]),2]
22
23## Then for gene length(log10)
24order_gene <- order( log10(length_per_gene) ) 
25cv=log10cv2_adj
26roll_medians_length <- rollapply(cv[order_gene], width = 50, by = 25,
27                               FUN = median, fill = list("extend", "extend", "NA") )
28## then change the NA values in the roll_medians_length
29table(is.na(roll_medians_length))
30ii_na <- which( is.na(roll_medians_length) )
31roll_medians_length[ii_na] <- median( cv[order_gene][ii_na] )
32names(roll_medians_length) <- rownames(exprSet)[order_gene] 
33roll_medians_length <- roll_medians_length[  match(rownames(exprSet), names(roll_medians_length) ) ]
34stopifnot( all.equal(names(roll_medians_length), rownames(exprSet) ) ) 
35log10cv2_adj <-  cv -  roll_medians_length
36plot( log10( cv^2) ,log10(length_per_gene))
37plot(log10cv2_adj,log10(length_per_gene)) 
38plot(log10cv2_adj,log10(mean_per_gene)) 
39
40#install.packages("basicTrendline")
41library(basicTrendline)
42par(mfrow=c(1,2))
43trendline(log10(length_per_gene),log10cv2_adj,model="line2P") 
44trendline(mean_per_gene,log10cv2_adj,model="line2P")

完全去除后，校正如下：

可以看到跟文章里面的非常接近了，校正两次后的CV值，就是 DM值

这个计算公式参考： https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4595712/#mmc1

你懂的，支持我继续创作，赞赏请留言，让我对你有个印象，也许将来会见面呢