From 46f3d3e39b3fb3a3faee7a9dbdf9bd55e0c819f2 Mon Sep 17 00:00:00 2001
From: rpoplin <rpoplin@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Wed, 2 Dec 2009 23:15:52 +0000
Subject: [PATCH] Added comments to AnalyzeCovariates and R scripts. R script
 prevents residuals from going off the edge of the plot. Added skeleton code
 to the recalibration walkers showing how we plan to handle SOLID reference
 inserting behavior.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2233 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/plot_residualError_OtherCovariate.R         | 37 ++++++++++++-------
 R/plot_residualError_QualityScoreCovariate.R  | 21 ++++++-----
 .../AnalysisDataManager.java                  |  3 ++
 .../analyzecovariates/AnalyzeCovariates.java  |  2 +-
 .../recalibration/CovariateCounterWalker.java | 19 +++++-----
 .../recalibration/RecalDataManager.java       | 11 ++++++
 .../TableRecalibrationWalker.java             | 22 +----------
 7 files changed, 61 insertions(+), 54 deletions(-)

diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R
index 55eba21c4..a85149061 100644
--- a/R/plot_residualError_OtherCovariate.R
+++ b/R/plot_residualError_OtherCovariate.R
@@ -6,36 +6,47 @@ verbose = TRUE
 input = args[1]
 covariateName = args[2]
 
-#X11(width=7, height=14)
-#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
 outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
 c <- read.table(input, header=T)
 c <- c[sort.list(c[,1]),]
 
+#
+# Plot residual error as a function of the covariate
+#
+
 d.good <- c[c$nBases >= 1000,]
 d.1000 <- c[c$nBases < 1000,]
-rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
+rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
 rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) )
-theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
+theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
 if( length(d.good$nBases) == length(c$nBases) ) {
 	theTitle = paste("RMSE = ", round(rmseAll,digits=3))
-	}
+}
+# Don't let residual error go off the edge of the plot
+d.good$residualError = d.good$Qempirical-d.good$Qreported
+d.good$residualError[which(d.good$residualError > 10)] = 10
+d.good$residualError[which(d.good$residualError < -10)] = -10
+d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
+d.1000$residualError[which(d.1000$residualError > 10)] = 10
+d.1000$residualError[which(d.1000$residualError < -10)] = -10
+c$residualError = c$Qempirical-c$Qreported
+c$residualError[which(c$residualError > 10)] = 10
+c$residualError[which(c$residualError < -10)] = -10
 if( is.numeric(c$Covariate) ) {
-	plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
-	points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=20)
-} else {
-	plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", 	xlab=covariateName, col="blue", ylim=c(-10, 10))
-	points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue")
+	plot(d.good$Covariate, d.good$residualError, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$residualError, type="p", col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
+	points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
 }
 dev.off()
-#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
 
 #
 # Plot histogram of the covariate
 #
+
 e = d.good
 f = d.1000
 outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
@@ -46,7 +57,7 @@ if( is.numeric(c$Covariate) ) {
 	plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=2, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
 	points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=2, col="cornflowerblue")
 	axis(2,axTicks(2), format(axTicks(2), scientific=F))
-} else {
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
 	hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
 	plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=7, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n")
 	axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R
index 278b57efd..8dfaf0b3d 100644
--- a/R/plot_residualError_QualityScoreCovariate.R
+++ b/R/plot_residualError_QualityScoreCovariate.R
@@ -6,11 +6,11 @@ input = args[1]
 Qcutoff = as.numeric(args[2])
 
 t=read.table(input, header=T)
-#t=read.csv(input)
-#par(mfrow=c(2,1), cex=1.2)
 
-#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
+#
+# Plot of reported quality versus empirical quality
+#
+
 outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
 pdf(outfile, height=7, width=7)
 d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
@@ -18,12 +18,12 @@ d.1000 <- t[t$nBases < 1000  & t$Qreported >= Qcutoff,]
 d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000  & t$Qreported >= Qcutoff,]
 f <- t[t$Qreported < Qcutoff,]
 e <- rbind(d.good, d.1000, d.10000)
-rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
+rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
 rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) )
-theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
+theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
 if (length(t$nBases) == length(d.good$nBases) ) {
 	theTitle = paste("RMSE = ", round(rmseAll,digits=3));
-	}
+}
 plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
 points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
 points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
@@ -31,8 +31,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
 abline(0,1, lty=2)
 dev.off()
 
-#outfile = paste(input, ".quality_emp_hist.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
+#
+# Plot Q empirical histogram
+#
+
 outfile = paste(input, ".quality_emp_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
@@ -45,6 +47,7 @@ dev.off()
 #
 # Plot Q reported histogram
 #
+
 outfile = paste(input, ".quality_rep_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java
index 3c5e12ebd..88e0d7494 100755
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java
@@ -10,6 +10,9 @@ import java.util.List;
  * Created by IntelliJ IDEA.
  * User: rpoplin
  * Date: Dec 1, 2009
+ *
+ * The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed
+ *  by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate.
  */
 
 public class AnalysisDataManager {
diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index b6eeebc0c..9c5440cda 100755
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -292,7 +292,7 @@ public class AnalyzeCovariates {
                         } else { // Analyze all other covariates
                             Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
                                         OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
-                                        cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice
+                                        cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
                         }
                     } catch (IOException e) {
                         e.printStackTrace();
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
index 99d995d34..f5c5bf426 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
@@ -101,7 +101,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
     private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
     private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
     private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
-    private static final String versionString = "v2.0.11"; // Major version, minor version, and build number
+    private static final String versionString = "v2.0.12"; // Major version, minor version, and build number
     private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L);  // mismatch/base counts for dbSNP loci
     private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L);  // mismatch/base counts for non-dbSNP loci
     private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0;        // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@@ -268,7 +268,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
             int offset;
             byte refBase;
             byte prevBase;
-            byte[] colorSpaceQuals;
             byte[] bases;
 
             // For each read at this locus
@@ -298,16 +297,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
                             // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. 
                             if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
 
-                                // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
-                                colorSpaceQuals = null;
+                                // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base
+                                //  so decrease the quality of this base by forcing it to be a mismatch
                                 if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
-                                    colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
-                                }
-                                if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
-                                {
-                                    // This base finally passed all the checks for a good base, so add it to the big data hashmap
-                                    updateDataFromRead( read, offset, refBase );
+                                    if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) {
+                                        refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch
+                                    }
                                 }
+
+                                // This base finally passed all the checks for a good base, so add it to the big data hashmap
+                                updateDataFromRead( read, offset, refBase );
                             } else {
                                 if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
                                     countedBases++; // Replicating a small bug in the old recalibrator
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
index 2e74067e4..f5bedae0e 100644
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@@ -56,6 +56,7 @@ public class RecalDataManager {
 
     public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
     public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
+    public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
     private static boolean warnUserNullReadGroup = false;
 
     RecalDataManager() {
@@ -243,4 +244,14 @@ public class RecalDataManager {
             readGroup.setPlatform( RAC.FORCE_PLATFORM );
         }
     }
+
+    /**
+     * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
+     * @param base The nucleotide we are checking
+     * @param read The read which contains the color space to check against
+     * @return Returns true if the base is inconsistent with the color space
+     */
+    public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) {
+        return false;
+    }
 }
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
index d972d1ddf..bd4c8f50f 100644
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
     private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
     private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
     private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
-    private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
+    private static final String versionString = "v2.0.10"; // Major version, minor version, and build number
     private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
 
     //---------------------------------------------------------------------------------------------------------------
@@ -328,12 +328,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
 
         preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
 
-        // SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
-        if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
-            byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
-            if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
-        }
-
         read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
         if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
             read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
@@ -430,20 +424,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
         }
     }
 
-    /**
-     * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
-     * @param originalQuals The list of original base quality scores
-     * @param recalQuals A list of the new recalibrated quality scores
-     * @param colorSpaceQuals The list of color space quality scores for this read
-     */
-    private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
-        for( int iii = 0; iii < recalQuals.length; iii++ ) {
-            if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
-                recalQuals[iii] = originalQuals[iii];
-            }
-        }
-    }
-
     //---------------------------------------------------------------------------------------------------------------
     //
     // reduce