Added comments to AnalyzeCovariates and R scripts. R script prevents residuals from going off the edge of the plot. Added skeleton code to the recalibration walkers showing how we plan to handle SOLID reference inserting behavior.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2233 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-02 23:15:52 +00:00 · 2009-12-02 23:15:52 +00:00 · 46f3d3e39b
parent 451a20ed55
commit 46f3d3e39b
7 changed files with 61 additions and 54 deletions
--- a/R/plot_residualError_OtherCovariate.R
+++ b/R/plot_residualError_OtherCovariate.R
@ -6,36 +6,47 @@ verbose = TRUE
 input = args[1]
 covariateName = args[2]

-#X11(width=7, height=14)
-#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
 outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
 c <- read.table(input, header=T)
 c <- c[sort.list(c[,1]),]

+#
+# Plot residual error as a function of the covariate
+#
+
 d.good <- c[c$nBases >= 1000,]
 d.1000 <- c[c$nBases < 1000,]
-rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
+rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
 rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) )
-theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
+theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
 if( length(d.good$nBases) == length(c$nBases) ) {
 	theTitle = paste("RMSE = ", round(rmseAll,digits=3))
-	}
+}
+# Don't let residual error go off the edge of the plot
+d.good$residualError = d.good$Qempirical-d.good$Qreported
+d.good$residualError[which(d.good$residualError > 10)] = 10
+d.good$residualError[which(d.good$residualError < -10)] = -10
+d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
+d.1000$residualError[which(d.1000$residualError > 10)] = 10
+d.1000$residualError[which(d.1000$residualError < -10)] = -10
+c$residualError = c$Qempirical-c$Qreported
+c$residualError[which(c$residualError > 10)] = 10
+c$residualError[which(c$residualError < -10)] = -10
 if( is.numeric(c$Covariate) ) {
-	plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
-	points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=20)
-} else {
-	plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", 	xlab=covariateName, col="blue", ylim=c(-10, 10))
-	points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue")
+	plot(d.good$Covariate, d.good$residualError, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$residualError, type="p", col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
+	points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
 }
 dev.off()
-#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)

 #
 # Plot histogram of the covariate
 #
+
 e = d.good
 f = d.1000
 outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
@ -46,7 +57,7 @@ if( is.numeric(c$Covariate) ) {
 	plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=2, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
 	points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=2, col="cornflowerblue")
 	axis(2,axTicks(2), format(axTicks(2), scientific=F))
-} else {
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
 	hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
 	plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=7, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n")
 	axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
--- a/R/plot_residualError_QualityScoreCovariate.R
+++ b/R/plot_residualError_QualityScoreCovariate.R
@ -6,11 +6,11 @@ input = args[1]
 Qcutoff = as.numeric(args[2])

 t=read.table(input, header=T)
-#t=read.csv(input)
-#par(mfrow=c(2,1), cex=1.2)

-#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
+#
+# Plot of reported quality versus empirical quality
+#
+
 outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
 pdf(outfile, height=7, width=7)
 d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
@ -18,12 +18,12 @@ d.1000 <- t[t$nBases < 1000  & t$Qreported >= Qcutoff,]
 d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000  & t$Qreported >= Qcutoff,]
 f <- t[t$Qreported < Qcutoff,]
 e <- rbind(d.good, d.1000, d.10000)
-rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
+rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
 rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) )
-theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
+theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
 if (length(t$nBases) == length(d.good$nBases) ) {
 	theTitle = paste("RMSE = ", round(rmseAll,digits=3));
-	}
+}
 plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
 points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
 points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
@ -31,8 +31,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
 abline(0,1, lty=2)
 dev.off()

-#outfile = paste(input, ".quality_emp_hist.png", sep="")
-#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
+#
+# Plot Q empirical histogram
+#
+
 outfile = paste(input, ".quality_emp_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
@ -45,6 +47,7 @@ dev.off()
 #
 # Plot Q reported histogram
 #
+
 outfile = paste(input, ".quality_rep_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java
@ -10,6 +10,9 @@ import java.util.List;
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Dec 1, 2009
+ *
+ * The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed
+ *  by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate.
 */

 public class AnalysisDataManager {
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@ -292,7 +292,7 @@ public class AnalyzeCovariates {
                        } else { // Analyze all other covariates
                            Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
                                        OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
-                                        cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice
+                                        cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
@ -101,7 +101,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
    private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
    private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
    private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
-    private static final String versionString = "v2.0.11"; // Major version, minor version, and build number
+    private static final String versionString = "v2.0.12"; // Major version, minor version, and build number
    private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L);  // mismatch/base counts for dbSNP loci
    private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L);  // mismatch/base counts for non-dbSNP loci
    private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0;        // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -268,7 +268,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
            int offset;
            byte refBase;
            byte prevBase;
-            byte[] colorSpaceQuals;
            byte[] bases;

            // For each read at this locus
@ -298,16 +297,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
                            // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. 
                            if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {

-                                // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
-                                colorSpaceQuals = null;
+                                // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base
+                                //  so decrease the quality of this base by forcing it to be a mismatch
                                if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
-                                    colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
-                                }
-                                if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
-                                {
-                                    // This base finally passed all the checks for a good base, so add it to the big data hashmap
-                                    updateDataFromRead( read, offset, refBase );
+                                    if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) {
+                                        refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch
+                                    }
                                }
+
+                                // This base finally passed all the checks for a good base, so add it to the big data hashmap
+                                updateDataFromRead( read, offset, refBase );
                            } else {
                                if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
                                    countedBases++; // Replicating a small bug in the old recalibrator
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@ -56,6 +56,7 @@ public class RecalDataManager {

    public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
    public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
+    public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
    private static boolean warnUserNullReadGroup = false;

    RecalDataManager() {
@ -243,4 +244,14 @@ public class RecalDataManager {
            readGroup.setPlatform( RAC.FORCE_PLATFORM );
        }
    }
+
+    /**
+     * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
+     * @param base The nucleotide we are checking
+     * @param read The read which contains the color space to check against
+     * @return Returns true if the base is inconsistent with the color space
+     */
+    public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) {
+        return false;
+    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
    private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
    private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
    private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
-    private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
+    private static final String versionString = "v2.0.10"; // Major version, minor version, and build number
    private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam

    //---------------------------------------------------------------------------------------------------------------
@ -328,12 +328,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite

        preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low

-        // SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
-        if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
-            byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
-            if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
-        }
-
        read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
        if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
            read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
@ -430,20 +424,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
        }
    }

-    /**
-     * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
-     * @param originalQuals The list of original base quality scores
-     * @param recalQuals A list of the new recalibrated quality scores
-     * @param colorSpaceQuals The list of color space quality scores for this read
-     */
-    private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
-        for( int iii = 0; iii < recalQuals.length; iii++ ) {
-            if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
-                recalQuals[iii] = originalQuals[iii];
-            }
-        }
-    }
-
    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce