Now uses AC directly from eval, not via AF, internally for AC vs. X plotting. Requires at least 1 SNP to include a site in TiTv plotting or snp/indel ratio. Uses .byAC not .byAF eval file now
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6014 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
64196b6c7a
commit
907018768c
37
R/exomeQC.R
37
R/exomeQC.R
|
|
@ -29,8 +29,9 @@ if ( onCMDLine ) {
|
|||
highlightSamples = c()
|
||||
} else {
|
||||
ProjectName = "InDevelopmentInR"
|
||||
preQCFile <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
|
||||
VariantEvalRoot <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated"
|
||||
preQCFile <- NA # "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
|
||||
#VariantEvalRoot <- "qcTestData//ESPGO_Gabriel_NHLBI_eomi_june_2011_batch1"
|
||||
VariantEvalRoot <- "qcTestData/MC_Engle_11_Samples_06092011"
|
||||
outputPDF = "bar.pdf"
|
||||
highlightSamples = c() # parseHighlightSamples("29029,47243")
|
||||
}
|
||||
|
|
@ -52,15 +53,13 @@ expandVEReport <- function(d) {
|
|||
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
|
||||
# Metrics by sites:
|
||||
# bySite -> counts of SNPs and Indels by novelty, with expectations
|
||||
# byAF -> snps and indels (known / novel)
|
||||
# byAC -> snps and indels (known / novel)
|
||||
r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
|
||||
byAF = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAF.eval", sep="")))
|
||||
r$byAF$CountVariants$nIndels = r$byAF$CountVariants$nInsertions + r$byAF$CountVariants$nDeletions
|
||||
|
||||
nChrom = 1 / min(r$byAF$TiTvVariantEvaluator$AlleleFrequency[r$byAF$TiTvVariantEvaluator$AlleleFrequency > 0])
|
||||
r$byAF$TiTvVariantEvaluator$nSNPs = r$byAF$TiTvVariantEvaluator$nTi + r$byAF$TiTvVariantEvaluator$nTv
|
||||
r$byAF$CountVariants$AC = r$byAF$CountVariants$AlleleFrequency * nChrom
|
||||
r$byAF$TiTvVariantEvaluator$AC = r$byAF$TiTvVariantEvaluator$AlleleFrequency * nChrom
|
||||
byAC = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAC.eval", sep="")))
|
||||
r$byAC$CountVariants$nIndels = r$byAC$CountVariants$nInsertions + r$byAC$CountVariants$nDeletions
|
||||
r$byAC$TiTvVariantEvaluator$nSNPs = r$byAC$TiTvVariantEvaluator$nTi + r$byAC$TiTvVariantEvaluator$nTv
|
||||
r$byAC$CountVariants$AC = r$byAC$CountVariants$AlleleCount
|
||||
r$byAC$TiTvVariantEvaluator$AC = r$byAC$TiTvVariantEvaluator$AlleleCount
|
||||
return(r)
|
||||
}
|
||||
|
||||
|
|
@ -77,7 +76,7 @@ summaryTable <- function(metricsBySites, metricsBySample) {
|
|||
|
||||
summaryPlots <- function(metricsBySites) {
|
||||
name = "SNP and Indel count by novelty and allele frequency"
|
||||
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
|
||||
molten = melt(subset(metricsBySites$byAC$CountVariants, Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
|
||||
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
|
||||
p <- p + opts(title = name)
|
||||
p <- p + scale_y_log10("Number of variants")
|
||||
|
|
@ -92,7 +91,7 @@ summaryPlots <- function(metricsBySites) {
|
|||
# Counts vs. Allele frequency
|
||||
name = "Variant counts by allele count"
|
||||
for ( measure in c("nSNPs", "nIndels")) {
|
||||
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
|
||||
molten = melt(subset(metricsBySites$byAC$CountVariants, AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
|
||||
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
|
||||
p <- p + opts(title = paste(name, ":", measure))
|
||||
p <- p + scale_y_log10("Number of variants")
|
||||
|
|
@ -104,8 +103,10 @@ summaryPlots <- function(metricsBySites) {
|
|||
}
|
||||
|
||||
name = "Transition / transversion ratio by allele count"
|
||||
byAFNoAll = subset(metricsBySites$byAF$TiTvVariantEvaluator, Novelty != "all" & FunctionalClass == "all" & AC > 0)
|
||||
p <- ggplot(data=byAFNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
|
||||
# nSNPs > 0 => requires that we have some data here, otherwise Ti/Tv is zero from VE
|
||||
minSNPsToInclude = 0
|
||||
byACNoAll = subset(metricsBySites$byAC$TiTvVariantEvaluator, Novelty != "all" & AC > 0 & nSNPs > minSNPsToInclude)
|
||||
p <- ggplot(data=byACNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
|
||||
p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
|
||||
p <- p + opts(title = name)
|
||||
p <- p + geom_smooth(size=2)
|
||||
|
|
@ -117,9 +118,9 @@ summaryPlots <- function(metricsBySites) {
|
|||
|
||||
# SNPs to indels ratio by allele frequency
|
||||
name = "SNPs to indels ratio by allele frequency"
|
||||
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio = metricsBySites$byAF$CountVariants$nSNPs / metricsBySites$byAF$CountVariants$nIndels
|
||||
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio[metricsBySites$byAF$CountVariants$nIndels == 0] = NaN
|
||||
p <- ggplot(data=subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty == "all"), aes(x=AC, y=SNP.Indel.Ratio))
|
||||
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio = metricsBySites$byAC$CountVariants$nSNPs / metricsBySites$byAC$CountVariants$nIndels
|
||||
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio[metricsBySites$byAC$CountVariants$nIndels == 0] = NaN
|
||||
p <- ggplot(data=subset(metricsBySites$byAC$CountVariants, Novelty == "all" & nSNPs > 0), aes(x=AC, y=SNP.Indel.Ratio))
|
||||
p <- p + opts(title = name)
|
||||
p <- p + scale_y_continuous("SNP to indel ratio")
|
||||
#p <- p + scale_y_log10()
|
||||
|
|
@ -157,7 +158,7 @@ createMetricsBySamples <- function(VariantEvalRoot) {
|
|||
# add highlight info
|
||||
r$highlight = r$Sample %in% highlightSamples
|
||||
|
||||
#r = merge(merge(preQCMetrics, byAFEval$TiTvVariantEvaluator), byAFEval$CountVariants)
|
||||
#r = merge(merge(preQCMetrics, byACEval$TiTvVariantEvaluator), byACEval$CountVariants)
|
||||
return(subset(r, Sample != "all"))
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue