Expanded the BQSR reporting script
-- Includes header page -- Table of arguments (Arguments) -- Summary of counts (RecalData0) -- Summary of counts by qual (RecalData1) -- Fixed bug in output that resulted in covariates list always being null (updated md5s accordingly) -- BQSR.R loads all relevant libraries now, including gplots, grid, and gsalib, to run correctly
This commit is contained in:
parent
458bbdee8f
commit
243af0adb1
|
|
@ -50,20 +50,20 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
|
||||
String HiSeqInterval = "chr1:10,000,000-10,100,000";
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "239ce3387b4540faf44ec000d844ccd1")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "d69127341938910c38166dd18449598d")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "b77e621bed1b0dc57970399a35efd0da")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "2697f38d467a7856c40abce0f778456a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "a55018b1643ca3964dbb50783db9f3e4")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "54fe8d1f5573845e6a2aa9688f6dd950")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "6b518ad3c56d66c6f5ea812d058f5c4d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "3ddb9730f00ee3a612b42209ed9f7e03")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4cd4fb754e1ef142ad691cb35c74dc4c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "364eab693e5e4c7d18a77726b6460f3f")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "c449cfca61d605b534f0dce35581339d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "5268cb5a4b69335568751d5e5ab80d43")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "3ddb9730f00ee3a612b42209ed9f7e03")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "4a786ba42e38e7fd101947c34a6883ed")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,18 @@
|
|||
library("ggplot2")
|
||||
library(gplots)
|
||||
library("reshape")
|
||||
library("grid")
|
||||
library("tools") #For compactPDF in R 2.13+
|
||||
library(gsalib)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
if ( interactive() ) {
|
||||
args <- c("NA12878.6.1.dedup.realign.recal.bqsr.grp.csv", "NA12878.6.1.dedup.realign.recal.bqsr.grp", NA)
|
||||
} else {
|
||||
args <- commandArgs(TRUE)
|
||||
}
|
||||
data <- read.csv(args[1])
|
||||
gsa.report <- gsa.read.gatkreport(args[2])
|
||||
data <- within(data, EventType <- factor(EventType, levels = rev(levels(EventType))))
|
||||
|
||||
numRG = length(unique(data$ReadGroup))
|
||||
|
|
@ -82,20 +92,45 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
|
|||
|
||||
p <- ggplot(d, aes(x=CovariateValue)) +
|
||||
xlab(paste(cov,"Covariate")) +
|
||||
ylab("Number of Observations") +
|
||||
ylab("No. of Observations (area normalized)") +
|
||||
blankTheme
|
||||
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations),alpha=0.6,binwidth=1,position="identity") + scale_fill_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
scale_y_continuous(formatter="comma")
|
||||
|
||||
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations,y=..ndensity..),alpha=0.6,binwidth=1,position="identity")
|
||||
d <- d + scale_fill_manual(values=c("maroon1","blue"))
|
||||
d <- d + facet_grid(.~EventType)
|
||||
# d <- d + scale_y_continuous(formatter="comma")
|
||||
}
|
||||
}
|
||||
|
||||
pdf(args[2],height=9,width=15)
|
||||
if ( ! is.na(args[3]) )
|
||||
pdf(args[3],height=9,width=15)
|
||||
|
||||
#frame()
|
||||
textplot(gsa.report$Arguments, show.rownames=F)
|
||||
title(
|
||||
main="GATK BaseRecalibration report",
|
||||
sub=date())
|
||||
|
||||
distributeGraphRows(list(a,b,c), c(1,1,1))
|
||||
distributeGraphRows(list(d,e,f), c(1,1,1))
|
||||
dev.off()
|
||||
|
||||
# format the overall information
|
||||
rt0 <- data.frame(
|
||||
ReadGroup = gsa.report$RecalTable0$ReadGroup,
|
||||
EventType = gsa.report$RecalTable0$EventType,
|
||||
EmpiricalQuality = sprintf("%.1f", gsa.report$RecalTable0$EmpiricalQuality),
|
||||
EstimatedQReported = sprintf("%.1f", gsa.report$RecalTable0$EstimatedQReported),
|
||||
Observations = sprintf("%.2e", gsa.report$RecalTable0$Observations),
|
||||
Errors = sprintf("%.2e", gsa.report$RecalTable0$Errors))
|
||||
textplot(t(rt0), show.colnames=F)
|
||||
title("Overall error rates by event type")
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(args[2])
|
||||
# Page with the per-quality-score recalibration table: RecalTable1 from the
# GATK report loaded into gsa.report, rendered as text.
textplot(gsa.report$RecalTable1, show.rownames=F)
title("Error rates by event type and initial quality score")

# The PDF device is only opened when an output path was given in args[3],
# so only close and compact it under the same condition.
if ( ! is.na(args[3]) ) {
  dev.off()
  if (exists('compactPDF')) {
    # args[3] is the PDF that was just written; args[2] is the GATK report
    # input read by gsa.read.gatkreport, not a PDF.
    compactPDF(args[3])
  }
}
|
||||
|
|
|
|||
|
|
@ -175,12 +175,12 @@ public class RecalibrationArgumentCollection {
|
|||
|
||||
public File recalibrationReport = null;
|
||||
|
||||
public GATKReportTable generateReportTable() {
|
||||
public GATKReportTable generateReportTable(final String covariateNames) {
|
||||
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
|
||||
argumentsTable.addColumn("Argument");
|
||||
argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
argumentsTable.addRowID("covariate", true);
|
||||
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
|
||||
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
|
||||
argumentsTable.addRowID("no_standard_covs", true);
|
||||
argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
|
||||
argumentsTable.addRowID("run_without_dbsnp", true);
|
||||
|
|
|
|||
|
|
@ -326,9 +326,23 @@ public class RecalUtils {
|
|||
}
|
||||
|
||||
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) {
|
||||
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
|
||||
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a human-readable string representing the used covariates
|
||||
*
|
||||
* @param requestedCovariates a vector of covariates
|
||||
* @return a non-null comma-separated string
|
||||
*/
|
||||
public static String covariateNames(final Covariate[] requestedCovariates) {
|
||||
final List<String> names = new ArrayList<String>(requestedCovariates.length);
|
||||
for ( final Covariate cov : requestedCovariates )
|
||||
names.add(cov.getClass().getSimpleName());
|
||||
return Utils.join(",", names);
|
||||
}
|
||||
|
||||
|
||||
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) {
|
||||
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile);
|
||||
}
|
||||
|
|
@ -352,7 +366,7 @@ public class RecalUtils {
|
|||
return new Pair<PrintStream, File>(deltaTableStream, deltaTableFileName);
|
||||
}
|
||||
|
||||
private static void outputRecalibrationPlot(Pair<PrintStream, File> files, boolean keepIntermediates) {
|
||||
private static void outputRecalibrationPlot(final File gatkReportFilename, Pair<PrintStream, File> files, boolean keepIntermediates) {
|
||||
final File csvFileName = files.getSecond();
|
||||
final File plotFileName = new File(csvFileName + ".pdf");
|
||||
files.getFirst().close();
|
||||
|
|
@ -360,6 +374,7 @@ public class RecalUtils {
|
|||
final RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
|
||||
executor.addArgs(csvFileName.getAbsolutePath());
|
||||
executor.addArgs(gatkReportFilename.getAbsolutePath());
|
||||
executor.addArgs(plotFileName.getAbsolutePath());
|
||||
executor.exec();
|
||||
|
||||
|
|
@ -372,14 +387,14 @@ public class RecalUtils {
|
|||
public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) {
|
||||
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
|
||||
writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true);
|
||||
outputRecalibrationPlot(files, keepIntermediates);
|
||||
outputRecalibrationPlot(filename, files, keepIntermediates);
|
||||
}
|
||||
|
||||
public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) {
|
||||
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
|
||||
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true);
|
||||
writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false);
|
||||
outputRecalibrationPlot(files, keepIntermediates);
|
||||
outputRecalibrationPlot(filename, files, keepIntermediates);
|
||||
}
|
||||
|
||||
private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue