diff --git a/R/plot_Annotations_BinnedTiTv.R b/R/plot_Annotations_BinnedTruthMetrics.R similarity index 59% rename from R/plot_Annotations_BinnedTiTv.R rename to R/plot_Annotations_BinnedTruthMetrics.R index 31d139559..182edfb2c 100644 --- a/R/plot_Annotations_BinnedTiTv.R +++ b/R/plot_Annotations_BinnedTruthMetrics.R @@ -12,6 +12,7 @@ c <- read.table(input, header=T) all = c[c$numVariants>minBinCutoff & c$category=="all",] novel = c[c$numVariants>minBinCutoff & c$category=="novel",] dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] d = c[c$numVariants>minBinCutoff,] ymin = min(d$titv) @@ -38,7 +39,12 @@ abline(v=m75,lty=2) abline(v=m25,lty=2) points(novel$value,novel$titv,col="green",pch=20) points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} dev.off() # @@ -55,47 +61,63 @@ abline(v=m75,lty=2) abline(v=m25,lty=2) points(novel$value,novel$titv,col="green",pch=20) points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} dev.off() # -# Plot dbsnp rate as a function of the annotation +# Plot dbsnp and true positive rate as a function of the annotation # -outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="") +outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) -plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) -ma = all[all$value > m,] -mb = all[all$value < m,] -m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) -m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) -abline(v=m,lty=2) -abline(v=m75,lty=2) -abline(v=m25,lty=2) -dev.off() - -# -# Plot dbsnp rate as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14); axis(1,axTicks(1), format(axTicks(1), scientific=F)) abline(v=m,lty=2) abline(v=m75,lty=2) abline(v=m25,lty=2) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(all$value,all$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.", annotationName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2) +abline(v=m75,lty=2) +abline(v=m25,lty=2) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(all$value,all$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} dev.off() # # Plot histogram of the annotation's value # -outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="") +outfile = paste(input, "annotationHistogram.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14); diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 61b73bc7b..ff526f267 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -99,7 +99,7 @@ public abstract class TraversalEngine { (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads)); logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates)); - logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary)); + logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary)); logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments)); logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java index 4863fe5a8..2527ad2a8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java @@ -84,13 +84,23 @@ public class AnalyzeAnnotationsWalker extends RodWalker { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - // Add each VCF record to each annotation's data structure if( tracker != null ) { + + // First find out if this variant is in the truth set + boolean isTrueVariant = false; for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { - if( rod != null && rod instanceof RodVCF ) { - RodVCF variant = (RodVCF) rod; + if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) { + isTrueVariant = true; + break; // at least one of the truth files has this variant, so break out + } + } + + // Add each annotation in this VCF Record to the dataManager + for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { + if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) { + final RodVCF variant = (RodVCF) rod; if( variant.isSNP() ) { - dataManager.addAnnotations( variant, SAMPLE_NAME ); + dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant ); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java index 4d23c3d2c..5db86f689 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -48,7 +48,7 @@ public class AnnotationDataManager { data = new HashMap>(); } - public void addAnnotations( final RodVCF variant, final String sampleName ) { + public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) { if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out @@ -80,7 +80,6 @@ public class AnnotationDataManager { } final boolean isNovelVariant = variant.getID().equals("."); - final boolean isTrueVariant = false; //BUGBUG: Check truth input file to see if this variant is in the truth set // Decide if the variant is a transition or transversion if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) { @@ -109,36 +108,37 @@ public class AnnotationDataManager { } // Output a header line - output.println("value\ttitv\tdbsnp\tnumVariants\tcategory"); + output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory"); // Bin SNPs and calculate truth metrics for each bin thisAnnotationBin.clearBin(); for( AnnotationDatum datum : data.get( annotationKey ) ) { thisAnnotationBin.combine( datum ); if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); thisAnnotationBin.clearBin(); } // else, continue accumulating variants because this bin isn't full yet } if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + - "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); thisAnnotationBin.clearBin(); - } // Close the PrintStream output.close(); // Print out the command line to make it clear to the user what is being executed and how one might modify it - final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + + final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " + OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN; System.out.println( rScriptCommandLine ); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java index 186e9e1a4..af8f66953 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java @@ -41,17 +41,18 @@ public class AnnotationDatum implements Comparator { private final int[] ti; private final int[] tv; - public static final int FULL_SET = -1; - public static final int NOVEL_SET = 0; - public static final int DBSNP_SET = 1; - public static final int TRUTH_SET = 2; + public static final int FULL_SET = 0; + public static final int NOVEL_SET = 1; + public static final int DBSNP_SET = 2; + public static final int TRUTH_SET = 3; + private static final int NUM_SETS = 4; public AnnotationDatum() { value = 0.0f; - ti = new int[3]; - tv = new int[3]; - for( int iii = 0; iii < 3; iii++ ) { + ti = new int[NUM_SETS]; + tv = new int[NUM_SETS]; + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; } @@ -60,9 +61,9 @@ public class AnnotationDatum implements Comparator { public AnnotationDatum( float _value ) { value = _value; - ti = new int[3]; - tv = new int[3]; - for( int iii = 0; iii < 3; iii++ ) { + ti = new int[NUM_SETS]; + tv = new int[NUM_SETS]; + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; } @@ -70,6 +71,7 @@ public class AnnotationDatum implements Comparator { final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) { + ti[FULL_SET]++; if( isNovelVariant ) { ti[NOVEL_SET]++; } else { // Is known, in DBsnp @@ -82,6 +84,7 @@ public class AnnotationDatum implements Comparator { final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) { + tv[FULL_SET]++; if( isNovelVariant ) { tv[NOVEL_SET]++; } else { // Is known, in DBsnp @@ -94,7 +97,7 @@ public class AnnotationDatum implements Comparator { final public void combine( final AnnotationDatum that ) { - for( int iii = 0; iii < 3; iii++ ) { + for( int iii = 0; iii < NUM_SETS; iii++ ) { this.ti[iii] += that.ti[iii]; this.tv[iii] += that.tv[iii]; } @@ -103,61 +106,44 @@ public class AnnotationDatum implements Comparator { final public float calcTiTv( final int INDEX ) { - if( INDEX == FULL_SET ) { - if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) { - throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); - } - - if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero - return 0.0f; - } - - return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) / - ((float) (tv[NOVEL_SET] + tv[DBSNP_SET])); - } else { - if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { - throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); - } - - if( tv[INDEX] == 0 ) { // Don't divide by zero - return 0.0f; - } - - return ((float) ti[INDEX]) / ((float) tv[INDEX]); + if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { + throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); } + + if( tv[INDEX] == 0 ) { // Don't divide by zero + return 0.0f; + } + + return ((float) ti[INDEX]) / ((float) tv[INDEX]); } final public float calcDBsnpRate() { - if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero + if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero return 0.0f; } return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) / - ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); + ((float) ti[FULL_SET] + tv[FULL_SET]); } final public float calcTPrate() { - if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero + if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero return 0.0f; } return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) / - ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); + ((float) ti[FULL_SET] + tv[FULL_SET]); } final public int numVariants( final int INDEX ) { - if( INDEX == FULL_SET ) { - return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]; - } else { - return ti[INDEX] + tv[INDEX]; - } + return ti[INDEX] + tv[INDEX]; } final public void clearBin() { value = 0.0f; - for( int iii = 0; iii < 3; iii++ ) { + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; }