Expanded the BQSR reporting script

-- Includes header page
-- Table of arguments (Arguments)
-- Summary of counts (RecalData0)
-- Summary of counts by qual (RecalData1)
-- Fixed bug in output that resulted in covariates list always being null (updated md5s accordingly)
-- BQSR.R loads all relevant libaries now, include gplots, grid, and gsalib to run correctly
This commit is contained in:
Mark DePristo 2012-08-12 13:44:07 -04:00
parent 458bbdee8f
commit 243af0adb1
4 changed files with 79 additions and 29 deletions

View File

@ -50,20 +50,20 @@ public class BQSRIntegrationTest extends WalkerTest {
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
String HiSeqInterval = "chr1:10,000,000-10,100,000";
return new Object[][]{
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "239ce3387b4540faf44ec000d844ccd1")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "d69127341938910c38166dd18449598d")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "b77e621bed1b0dc57970399a35efd0da")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "2697f38d467a7856c40abce0f778456a")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "a55018b1643ca3964dbb50783db9f3e4")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "54fe8d1f5573845e6a2aa9688f6dd950")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "6b518ad3c56d66c6f5ea812d058f5c4d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "3ddb9730f00ee3a612b42209ed9f7e03")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4cd4fb754e1ef142ad691cb35c74dc4c")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "364eab693e5e4c7d18a77726b6460f3f")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "c449cfca61d605b534f0dce35581339d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "5268cb5a4b69335568751d5e5ab80d43")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "3ddb9730f00ee3a612b42209ed9f7e03")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "4a786ba42e38e7fd101947c34a6883ed")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")},
};
}

View File

@ -1,8 +1,18 @@
library("ggplot2")
library(gplots)
library("reshape")
library("grid")
library("tools") #For compactPDF in R 2.13+
library(gsalib)
args <- commandArgs(TRUE)
if ( interactive() ) {
args <- c("NA12878.6.1.dedup.realign.recal.bqsr.grp.csv", "NA12878.6.1.dedup.realign.recal.bqsr.grp", NA)
} else {
args <- commandArgs(TRUE)
}
data <- read.csv(args[1])
gsa.report <- gsa.read.gatkreport(args[2])
data <- within(data, EventType <- factor(EventType, levels = rev(levels(EventType))))
numRG = length(unique(data$ReadGroup))
@ -82,20 +92,45 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
p <- ggplot(d, aes(x=CovariateValue)) +
xlab(paste(cov,"Covariate")) +
ylab("Number of Observations") +
ylab("No. of Observations (area normalized)") +
blankTheme
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations),alpha=0.6,binwidth=1,position="identity") + scale_fill_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
scale_y_continuous(formatter="comma")
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations,y=..ndensity..),alpha=0.6,binwidth=1,position="identity")
d <- d + scale_fill_manual(values=c("maroon1","blue"))
d <- d + facet_grid(.~EventType)
# d <- d + scale_y_continuous(formatter="comma")
}
}
pdf(args[2],height=9,width=15)
if ( ! is.na(args[3]) )
pdf(args[3],height=9,width=15)
#frame()
textplot(gsa.report$Arguments, show.rownames=F)
title(
main="GATK BaseRecalibration report",
sub=date())
distributeGraphRows(list(a,b,c), c(1,1,1))
distributeGraphRows(list(d,e,f), c(1,1,1))
dev.off()
# format the overall information
rt0 <- data.frame(
ReadGroup = gsa.report$RecalTable0$ReadGroup,
EventType = gsa.report$RecalTable0$EventType,
EmpiricalQuality = sprintf("%.1f", gsa.report$RecalTable0$EmpiricalQuality),
EstimatedQReported = sprintf("%.1f", gsa.report$RecalTable0$EstimatedQReported),
Observations = sprintf("%.2e", gsa.report$RecalTable0$Observations),
Errors = sprintf("%.2e", gsa.report$RecalTable0$Errors))
textplot(t(rt0), show.colnames=F)
title("Overall error rates by event type")
if (exists('compactPDF')) {
compactPDF(args[2])
# plot per quality score recalibration table
textplot(gsa.report$RecalTable1, show.rownames=F)
title("Rrror rates by event type and initial quality score")
if ( ! is.na(args[3]) ) {
dev.off()
if (exists('compactPDF')) {
compactPDF(args[2])
}
}

View File

@ -175,12 +175,12 @@ public class RecalibrationArgumentCollection {
public File recalibrationReport = null;
public GATKReportTable generateReportTable() {
public GATKReportTable generateReportTable(final String covariateNames) {
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
argumentsTable.addColumn("Argument");
argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
argumentsTable.addRowID("covariate", true);
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
argumentsTable.addRowID("no_standard_covs", true);
argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
argumentsTable.addRowID("run_without_dbsnp", true);

View File

@ -326,9 +326,23 @@ public class RecalUtils {
}
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) {
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
}
/**
* Return a human-readable string representing the used covariates
*
* @param requestedCovariates a vector of covariates
* @return a non-null comma-separated string
*/
public static String covariateNames(final Covariate[] requestedCovariates) {
final List<String> names = new ArrayList<String>(requestedCovariates.length);
for ( final Covariate cov : requestedCovariates )
names.add(cov.getClass().getSimpleName());
return Utils.join(",", names);
}
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) {
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
}
@ -352,7 +366,7 @@ public class RecalUtils {
return new Pair<PrintStream, File>(deltaTableStream, deltaTableFileName);
}
private static void outputRecalibrationPlot(Pair<PrintStream, File> files, boolean keepIntermediates) {
private static void outputRecalibrationPlot(final File gatkReportFilename, Pair<PrintStream, File> files, boolean keepIntermediates) {
final File csvFileName = files.getSecond();
final File plotFileName = new File(csvFileName + ".pdf");
files.getFirst().close();
@ -360,6 +374,7 @@ public class RecalUtils {
final RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
executor.addArgs(csvFileName.getAbsolutePath());
executor.addArgs(gatkReportFilename.getAbsolutePath());
executor.addArgs(plotFileName.getAbsolutePath());
executor.exec();
@ -372,14 +387,14 @@ public class RecalUtils {
public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) {
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true);
outputRecalibrationPlot(files, keepIntermediates);
outputRecalibrationPlot(filename, files, keepIntermediates);
}
public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) {
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true);
writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false);
outputRecalibrationPlot(files, keepIntermediates);
outputRecalibrationPlot(filename, files, keepIntermediates);
}
private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) {