From 46f3d3e39b3fb3a3faee7a9dbdf9bd55e0c819f2 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 2 Dec 2009 23:15:52 +0000 Subject: [PATCH] Added comments to AnalyzeCovariates and R scripts. R script prevents residuals from going off the edge of the plot. Added skeleton code to the recalibration walkers showing how we plan to handle SOLID reference inserting behavior. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2233 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_residualError_OtherCovariate.R | 37 ++++++++++++------- R/plot_residualError_QualityScoreCovariate.R | 21 ++++++----- .../AnalysisDataManager.java | 3 ++ .../analyzecovariates/AnalyzeCovariates.java | 2 +- .../recalibration/CovariateCounterWalker.java | 19 +++++----- .../recalibration/RecalDataManager.java | 11 ++++++ .../TableRecalibrationWalker.java | 22 +---------- 7 files changed, 61 insertions(+), 54 deletions(-) diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R index 55eba21c4..a85149061 100644 --- a/R/plot_residualError_OtherCovariate.R +++ b/R/plot_residualError_OtherCovariate.R @@ -6,36 +6,47 @@ verbose = TRUE input = args[1] covariateName = args[2] -#X11(width=7, height=14) -#outfile = paste(input, ".qual_diff_v_cycle.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) c <- read.table(input, header=T) c <- c[sort.list(c[,1]),] +# +# Plot residual error as a function of the covariate +# + d.good <- c[c$nBases >= 1000,] d.1000 <- c[c$nBases < 1000,] -rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) +rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) ) -theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) if( length(d.good$nBases) == length(c$nBases) ) { theTitle = paste("RMSE = ", round(rmseAll,digits=3)) - } +} +# Don't let residual error go off the edge of the plot +d.good$residualError = d.good$Qempirical-d.good$Qreported +d.good$residualError[which(d.good$residualError > 10)] = 10 +d.good$residualError[which(d.good$residualError < -10)] = -10 +d.1000$residualError = d.1000$Qempirical-d.1000$Qreported +d.1000$residualError[which(d.1000$residualError > 10)] = 10 +d.1000$residualError[which(d.1000$residualError < -10)] = -10 +c$residualError = c$Qempirical-c$Qreported +c$residualError[which(c$residualError > 10)] = 10 +c$residualError[which(c$residualError < -10)] = -10 if( is.numeric(c$Covariate) ) { - plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) - points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=20) -} else { - plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) - points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue") + plot(d.good$Covariate, d.good$residualError, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$residualError, type="p", col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") } dev.off() -#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16) # # Plot histogram of the covariate # + e = d.good f = d.1000 outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") @@ -46,7 +57,7 @@ if( is.numeric(c$Covariate) ) { plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=2, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=2, col="cornflowerblue") axis(2,axTicks(2), format(axTicks(2), scientific=F)) -} else { +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=7, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n") axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R index 278b57efd..8dfaf0b3d 100644 --- a/R/plot_residualError_QualityScoreCovariate.R +++ b/R/plot_residualError_QualityScoreCovariate.R @@ -6,11 +6,11 @@ input = args[1] Qcutoff = as.numeric(args[2]) t=read.table(input, header=T) -#t=read.csv(input) -#par(mfrow=c(2,1), cex=1.2) -#outfile = paste(input, ".quality_emp_v_stated.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +# +# Plot of reported quality versus empirical quality +# + outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") pdf(outfile, height=7, width=7) d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] @@ -18,12 +18,12 @@ d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] f <- t[t$Qreported < Qcutoff,] e <- rbind(d.good, d.1000, d.10000) -rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) +rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) ) -theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) if (length(t$nBases) == length(d.good$nBases) ) { theTitle = paste("RMSE = ", round(rmseAll,digits=3)); - } +} plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score") points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) @@ -31,8 +31,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) abline(0,1, lty=2) dev.off() -#outfile = paste(input, ".quality_emp_hist.png", sep="") -#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +# +# Plot Q empirical histogram +# + outfile = paste(input, ".quality_emp_hist.pdf", sep="") pdf(outfile, height=7, width=7) hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) @@ -45,6 +47,7 @@ dev.off() # # Plot Q reported histogram # + outfile = paste(input, ".quality_rep_hist.pdf", sep="") pdf(outfile, height=7, width=7) hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java index 3c5e12ebd..88e0d7494 100755 --- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java @@ -10,6 +10,9 @@ import java.util.List; * Created by IntelliJ IDEA. * User: rpoplin * Date: Dec 1, 2009 + * + * The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed + * by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate. */ public class AnalysisDataManager { diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index b6eeebc0c..9c5440cda 100755 --- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -292,7 +292,7 @@ public class AnalyzeCovariates { } else { // Analyze all other covariates Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + - cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice + cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice } } catch (IOException e) { e.printStackTrace(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 99d995d34..f5c5bf426 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -101,7 +101,7 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.0.11"; // Major version, minor version, and build number + private static final String versionString = "v2.0.12"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -268,7 +268,6 @@ public class CovariateCounterWalker extends LocusWalker { int offset; byte refBase; byte prevBase; - byte[] colorSpaceQuals; byte[] bases; // For each read at this locus @@ -298,16 +297,16 @@ public class CovariateCounterWalker extends LocusWalker { // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { - // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them - colorSpaceQuals = null; + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base + // so decrease the quality of this base by forcing it to be a mismatch if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { - colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); - } - if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet - { - // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( read, offset, refBase ); + if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) { + refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch + } } + + // This base finally passed all the checks for a good base, so add it to the big data hashmap + updateDataFromRead( read, offset, refBase ); } else { if( RAC.VALIDATE_OLD_RECALIBRATOR ) { countedBases++; // Replicating a small bug in the old recalibrator diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 2e74067e4..f5bedae0e 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -56,6 +56,7 @@ public class RecalDataManager { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams private static boolean warnUserNullReadGroup = false; RecalDataManager() { @@ -243,4 +244,14 @@ public class RecalDataManager { readGroup.setPlatform( RAC.FORCE_PLATFORM ); } } + + /** + * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality + * @param base The nucleotide we are checking + * @param read The read which contains the color space to check against + * @return Returns true if the base is inconsistent with the color space + */ + public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) { + return false; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index d972d1ddf..bd4c8f50f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker