Now uses AC directly from the eval file, rather than deriving it from AF, for the internal AC vs. X plotting. Requires at least 1 SNP for a site to be included in the Ti/Tv plot or the SNP/indel ratio plot. Now reads the .byAC eval file instead of the .byAF eval file.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6014 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
64196b6c7a
commit
907018768c
37
R/exomeQC.R
37
R/exomeQC.R
|
|
@ -29,8 +29,9 @@ if ( onCMDLine ) {
|
||||||
highlightSamples = c()
|
highlightSamples = c()
|
||||||
} else {
|
} else {
|
||||||
ProjectName = "InDevelopmentInR"
|
ProjectName = "InDevelopmentInR"
|
||||||
preQCFile <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
|
preQCFile <- NA # "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
|
||||||
VariantEvalRoot <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated"
|
#VariantEvalRoot <- "qcTestData//ESPGO_Gabriel_NHLBI_eomi_june_2011_batch1"
|
||||||
|
VariantEvalRoot <- "qcTestData/MC_Engle_11_Samples_06092011"
|
||||||
outputPDF = "bar.pdf"
|
outputPDF = "bar.pdf"
|
||||||
highlightSamples = c() # parseHighlightSamples("29029,47243")
|
highlightSamples = c() # parseHighlightSamples("29029,47243")
|
||||||
}
|
}
|
||||||
|
|
@ -52,15 +53,13 @@ expandVEReport <- function(d) {
|
||||||
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
|
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
|
||||||
# Metrics by sites:
|
# Metrics by sites:
|
||||||
# bySite -> counts of SNPs and Indels by novelty, with expectations
|
# bySite -> counts of SNPs and Indels by novelty, with expectations
|
||||||
# byAF -> snps and indels (known / novel)
|
# byAC -> snps and indels (known / novel)
|
||||||
r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
|
r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
|
||||||
byAF = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAF.eval", sep="")))
|
byAC = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAC.eval", sep="")))
|
||||||
r$byAF$CountVariants$nIndels = r$byAF$CountVariants$nInsertions + r$byAF$CountVariants$nDeletions
|
r$byAC$CountVariants$nIndels = r$byAC$CountVariants$nInsertions + r$byAC$CountVariants$nDeletions
|
||||||
|
r$byAC$TiTvVariantEvaluator$nSNPs = r$byAC$TiTvVariantEvaluator$nTi + r$byAC$TiTvVariantEvaluator$nTv
|
||||||
nChrom = 1 / min(r$byAF$TiTvVariantEvaluator$AlleleFrequency[r$byAF$TiTvVariantEvaluator$AlleleFrequency > 0])
|
r$byAC$CountVariants$AC = r$byAC$CountVariants$AlleleCount
|
||||||
r$byAF$TiTvVariantEvaluator$nSNPs = r$byAF$TiTvVariantEvaluator$nTi + r$byAF$TiTvVariantEvaluator$nTv
|
r$byAC$TiTvVariantEvaluator$AC = r$byAC$TiTvVariantEvaluator$AlleleCount
|
||||||
r$byAF$CountVariants$AC = r$byAF$CountVariants$AlleleFrequency * nChrom
|
|
||||||
r$byAF$TiTvVariantEvaluator$AC = r$byAF$TiTvVariantEvaluator$AlleleFrequency * nChrom
|
|
||||||
return(r)
|
return(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -77,7 +76,7 @@ summaryTable <- function(metricsBySites, metricsBySample) {
|
||||||
|
|
||||||
summaryPlots <- function(metricsBySites) {
|
summaryPlots <- function(metricsBySites) {
|
||||||
name = "SNP and Indel count by novelty and allele frequency"
|
name = "SNP and Indel count by novelty and allele frequency"
|
||||||
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
|
molten = melt(subset(metricsBySites$byAC$CountVariants, Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
|
||||||
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
|
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
|
||||||
p <- p + opts(title = name)
|
p <- p + opts(title = name)
|
||||||
p <- p + scale_y_log10("Number of variants")
|
p <- p + scale_y_log10("Number of variants")
|
||||||
|
|
@ -92,7 +91,7 @@ summaryPlots <- function(metricsBySites) {
|
||||||
# Counts vs. Allele frequency
|
# Counts vs. Allele frequency
|
||||||
name = "Variant counts by allele count"
|
name = "Variant counts by allele count"
|
||||||
for ( measure in c("nSNPs", "nIndels")) {
|
for ( measure in c("nSNPs", "nIndels")) {
|
||||||
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
|
molten = melt(subset(metricsBySites$byAC$CountVariants, AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
|
||||||
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
|
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
|
||||||
p <- p + opts(title = paste(name, ":", measure))
|
p <- p + opts(title = paste(name, ":", measure))
|
||||||
p <- p + scale_y_log10("Number of variants")
|
p <- p + scale_y_log10("Number of variants")
|
||||||
|
|
@ -104,8 +103,10 @@ summaryPlots <- function(metricsBySites) {
|
||||||
}
|
}
|
||||||
|
|
||||||
name = "Transition / transversion ratio by allele count"
|
name = "Transition / transversion ratio by allele count"
|
||||||
byAFNoAll = subset(metricsBySites$byAF$TiTvVariantEvaluator, Novelty != "all" & FunctionalClass == "all" & AC > 0)
|
# nSNPs > 0 => requires that we have some data here, otherwise Ti/Tv is zero from VE
|
||||||
p <- ggplot(data=byAFNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
|
minSNPsToInclude = 0
|
||||||
|
byACNoAll = subset(metricsBySites$byAC$TiTvVariantEvaluator, Novelty != "all" & AC > 0 & nSNPs > minSNPsToInclude)
|
||||||
|
p <- ggplot(data=byACNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
|
||||||
p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
|
p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
|
||||||
p <- p + opts(title = name)
|
p <- p + opts(title = name)
|
||||||
p <- p + geom_smooth(size=2)
|
p <- p + geom_smooth(size=2)
|
||||||
|
|
@ -117,9 +118,9 @@ summaryPlots <- function(metricsBySites) {
|
||||||
|
|
||||||
# SNPs to indels ratio by allele frequency
|
# SNPs to indels ratio by allele frequency
|
||||||
name = "SNPs to indels ratio by allele frequency"
|
name = "SNPs to indels ratio by allele frequency"
|
||||||
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio = metricsBySites$byAF$CountVariants$nSNPs / metricsBySites$byAF$CountVariants$nIndels
|
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio = metricsBySites$byAC$CountVariants$nSNPs / metricsBySites$byAC$CountVariants$nIndels
|
||||||
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio[metricsBySites$byAF$CountVariants$nIndels == 0] = NaN
|
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio[metricsBySites$byAC$CountVariants$nIndels == 0] = NaN
|
||||||
p <- ggplot(data=subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty == "all"), aes(x=AC, y=SNP.Indel.Ratio))
|
p <- ggplot(data=subset(metricsBySites$byAC$CountVariants, Novelty == "all" & nSNPs > 0), aes(x=AC, y=SNP.Indel.Ratio))
|
||||||
p <- p + opts(title = name)
|
p <- p + opts(title = name)
|
||||||
p <- p + scale_y_continuous("SNP to indel ratio")
|
p <- p + scale_y_continuous("SNP to indel ratio")
|
||||||
#p <- p + scale_y_log10()
|
#p <- p + scale_y_log10()
|
||||||
|
|
@ -157,7 +158,7 @@ createMetricsBySamples <- function(VariantEvalRoot) {
|
||||||
# add highlight info
|
# add highlight info
|
||||||
r$highlight = r$Sample %in% highlightSamples
|
r$highlight = r$Sample %in% highlightSamples
|
||||||
|
|
||||||
#r = merge(merge(preQCMetrics, byAFEval$TiTvVariantEvaluator), byAFEval$CountVariants)
|
#r = merge(merge(preQCMetrics, byACEval$TiTvVariantEvaluator), byACEval$CountVariants)
|
||||||
return(subset(r, Sample != "all"))
|
return(subset(r, Sample != "all"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue