AnalyzeAnnotations now breaks out titv by calls in hapmap and also plots true positive rates. Any RODs passed in whose name starts with 'truth' is considered to be the truth set.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2726 348d0f76-0448-11de-a6fe-93d51630548a
2010-01-28 21:41:23 +00:00 · 2010-01-28 21:41:23 +00:00 · 79c4cc1db7
parent 7a10c40fb3
commit 79c4cc1db7
5 changed files with 100 additions and 82 deletions
--- a/R/plot_Annotations_BinnedTruthMetrics.R
+++ b/R/plot_Annotations_BinnedTruthMetrics.R
@ -12,6 +12,7 @@ c <- read.table(input, header=T)
 all = c[c$numVariants>minBinCutoff & c$category=="all",]
 novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
 dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
+truth = c[c$numVariants>minBinCutoff & c$category=="truth",]

 d = c[c$numVariants>minBinCutoff,]
 ymin = min(d$titv)
@ -38,7 +39,12 @@ abline(v=m75,lty=2)
 abline(v=m25,lty=2)
 points(novel$value,novel$titv,col="green",pch=20)
 points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(truth$value,truth$titv,col="magenta",pch=20)
+legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
+} else {
 legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
+}
 dev.off()

 #
@ -55,47 +61,63 @@ abline(v=m75,lty=2)
 abline(v=m25,lty=2)
 points(novel$value,novel$titv,col="green",pch=20)
 points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(truth$value,truth$titv,col="magenta",pch=20)
+legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
+} else {
 legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
+}
 dev.off()

 #
-# Plot dbsnp rate as a function of the annotation
+# Plot dbsnp and true positive rate as a function of the annotation
 #

-outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="")
+outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
-plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
-axis(1,axTicks(1), format(axTicks(1), scientific=F))
-m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
-ma = all[all$value > m,]
-mb = all[all$value < m,]
-m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
-m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
-abline(v=m,lty=2)
-abline(v=m75,lty=2)
-abline(v=m25,lty=2)
-dev.off()
-
-#
-# Plot dbsnp rate as a function of the annotation, log scale on the x-axis
-#
-
-outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="")
-pdf(outfile, height=7, width=7)
-par(cex=1.1)
-plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
+yLabel = "DBsnp Rate"
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+yLabel = "DBsnp/Truth Rate"
+}
+plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14);
 axis(1,axTicks(1), format(axTicks(1), scientific=F))
 abline(v=m,lty=2)
 abline(v=m75,lty=2)
 abline(v=m25,lty=2)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(all$value,all$truePositive,col="magenta",pch=20);
+legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
+}
+dev.off()
+
+#
+# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
+#
+
+outfile = paste(input, ".truthRate_log.", annotationName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+yLabel = "DBsnp Rate"
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+yLabel = "DBsnp/Truth Rate"
+}
+plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+abline(v=m,lty=2)
+abline(v=m75,lty=2)
+abline(v=m25,lty=2)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(all$value,all$truePositive,col="magenta",pch=20);
+legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
+}
 dev.off()

 #
 # Plot histogram of the annotation's value
 #

-outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="")
+outfile = paste(input, "annotationHistogram.", annotationName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
 plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14);
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@ -99,7 +99,7 @@ public abstract class TraversalEngine {
                (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
        logger.info(String.format("  -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
        logger.info(String.format("  -> %d duplicate reads", TraversalStatistics.nDuplicates));
-        logger.info(String.format("  -> %d non-primary reads", TraversalStatistics.nNotPrimary));
+        logger.info(String.format("  -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary));
        logger.info(String.format("  -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
        logger.info(String.format("  -> %d reads with indels", TraversalStatistics.nSkippedIndels));
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java
@ -84,13 +84,23 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {

    public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {

-        // Add each VCF record to each annotation's data structure
        if( tracker != null ) {
+
+            // First find out if this variant is in the truth set
+            boolean isTrueVariant = false;
            for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
-                if( rod != null && rod instanceof RodVCF ) {
-                    RodVCF variant = (RodVCF) rod;
+                if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) {
+                    isTrueVariant = true;
+                    break; // at least one of the truth files has this variant, so break out
+                }
+            }
+            
+            // Add each annotation in this VCF Record to the dataManager
+            for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
+                if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
+                    final RodVCF variant = (RodVCF) rod;
                    if( variant.isSNP() ) {
-                        dataManager.addAnnotations( variant, SAMPLE_NAME );
+                        dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant );
                    }
                }
            }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
@ -48,7 +48,7 @@ public class AnnotationDataManager {
        data = new HashMap<String, TreeSet<AnnotationDatum>>();
    }

-    public void addAnnotations( final RodVCF variant, final String sampleName ) {
+    public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) {

        if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
            if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
@ -80,7 +80,6 @@ public class AnnotationDataManager {
            }

            final boolean isNovelVariant = variant.getID().equals(".");
-            final boolean isTrueVariant = false; //BUGBUG: Check truth input file to see if this variant is in the truth set

            // Decide if the variant is a transition or transversion
            if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) {
@ -109,36 +108,37 @@ public class AnnotationDataManager {
            }

            // Output a header line
-            output.println("value\ttitv\tdbsnp\tnumVariants\tcategory");
+            output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory");

            // Bin SNPs and calculate truth metrics for each bin
            thisAnnotationBin.clearBin();
            for( AnnotationDatum datum : data.get( annotationKey ) ) {
                thisAnnotationBin.combine( datum );
                if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
-                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() +
+                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
                            "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
-                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
-                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
+                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
+                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
+                    output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
                    thisAnnotationBin.clearBin();
                }
                // else, continue accumulating variants because this bin isn't full yet
            }

            if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out
-                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + 
-                        "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
-                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
-                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
+                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
+                            "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
+                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
+                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
+                output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
                thisAnnotationBin.clearBin();
-
            }

            // Close the PrintStream
            output.close();

            // Print out the command line to make it clear to the user what is being executed and how one might modify it
-            final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
+            final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
                                   OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN;
            System.out.println( rScriptCommandLine );

--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java
@ -41,17 +41,18 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
    private final int[] ti;
    private final int[] tv;

-    public static final int FULL_SET = -1;
-    public static final int NOVEL_SET = 0;
-    public static final int DBSNP_SET = 1;
-    public static final int TRUTH_SET = 2;
+    public static final int FULL_SET = 0;
+    public static final int NOVEL_SET = 1;
+    public static final int DBSNP_SET = 2;
+    public static final int TRUTH_SET = 3;
+    private static final int NUM_SETS = 4;

    public AnnotationDatum() {

        value = 0.0f;
-        ti = new int[3];
-        tv = new int[3];
-        for( int iii = 0; iii < 3; iii++ ) {
+        ti = new int[NUM_SETS];
+        tv = new int[NUM_SETS];
+        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            ti[iii] = 0;
            tv[iii] = 0;
        }
@ -60,9 +61,9 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
    public AnnotationDatum( float _value ) {

        value = _value;
-        ti = new int[3];
-        tv = new int[3];
-        for( int iii = 0; iii < 3; iii++ ) {
+        ti = new int[NUM_SETS];
+        tv = new int[NUM_SETS];
+        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            ti[iii] = 0;
            tv[iii] = 0;
        }
@ -70,6 +71,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {

    final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {

+        ti[FULL_SET]++;
        if( isNovelVariant ) {
            ti[NOVEL_SET]++;
        } else { // Is known, in DBsnp
@ -82,6 +84,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {

    final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {

+        tv[FULL_SET]++;
        if( isNovelVariant ) {
            tv[NOVEL_SET]++;
        } else { // Is known, in DBsnp
@ -94,7 +97,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {

    final public void combine( final AnnotationDatum that ) {

-        for( int iii = 0; iii < 3; iii++ ) {
+        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            this.ti[iii] += that.ti[iii];
            this.tv[iii] += that.tv[iii];
        }
@ -103,61 +106,44 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {

    final public float calcTiTv( final int INDEX ) {

-        if( INDEX == FULL_SET ) {
-            if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) {
-                throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
-            }
-
-            if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero
-                return 0.0f;
-            }
-
-            return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) /
-                    ((float) (tv[NOVEL_SET] + tv[DBSNP_SET]));
-        } else {
-            if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
-                throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
-            }
-
-            if( tv[INDEX] == 0 ) { // Don't divide by zero
-                return 0.0f;
-            }
-
-            return ((float) ti[INDEX]) / ((float) tv[INDEX]);
+        if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
+            throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
        }
+
+        if( tv[INDEX] == 0 ) { // Don't divide by zero
+            return 0.0f;
+        }
+
+        return ((float) ti[INDEX]) / ((float) tv[INDEX]);
    }

    final public float calcDBsnpRate() {

-        if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
+        if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
            return 0.0f;
        }

        return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
-                ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
+                ((float) ti[FULL_SET] + tv[FULL_SET]);
    }

    final public float calcTPrate() {

-        if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
+        if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
            return 0.0f;
        }

        return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
-                ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
+                ((float) ti[FULL_SET] + tv[FULL_SET]);
    }

    final public int numVariants( final int INDEX ) {
-        if( INDEX == FULL_SET ) {
-            return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET];
-        } else {
-            return ti[INDEX] + tv[INDEX];
-        }
+        return ti[INDEX] + tv[INDEX];
    }

    final public void clearBin() {
        value = 0.0f;
-        for( int iii = 0; iii < 3; iii++ ) {
+        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            ti[iii] = 0;
            tv[iii] = 0;
        }