From 79c4cc1db7ac3da56db8e039a76c75b6836d61be Mon Sep 17 00:00:00 2001 From: rpoplin Date: Thu, 28 Jan 2010 21:41:23 +0000 Subject: [PATCH] AnalyzeAnnotations now breaks out titv by calls in hapmap and also plots true positive rates. Any RODs passed in whose name starts with 'truth' is considered to be the truth set. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2726 348d0f76-0448-11de-a6fe-93d51630548a --- ... => plot_Annotations_BinnedTruthMetrics.R} | 68 ++++++++++++------ .../gatk/traversals/TraversalEngine.java | 2 +- .../AnalyzeAnnotationsWalker.java | 18 +++-- .../AnnotationDataManager.java | 24 +++---- .../variantoptimizer/AnnotationDatum.java | 70 ++++++++----------- 5 files changed, 100 insertions(+), 82 deletions(-) rename R/{plot_Annotations_BinnedTiTv.R => plot_Annotations_BinnedTruthMetrics.R} (59%) diff --git a/R/plot_Annotations_BinnedTiTv.R b/R/plot_Annotations_BinnedTruthMetrics.R similarity index 59% rename from R/plot_Annotations_BinnedTiTv.R rename to R/plot_Annotations_BinnedTruthMetrics.R index 31d139559..182edfb2c 100644 --- a/R/plot_Annotations_BinnedTiTv.R +++ b/R/plot_Annotations_BinnedTruthMetrics.R @@ -12,6 +12,7 @@ c <- read.table(input, header=T) all = c[c$numVariants>minBinCutoff & c$category=="all",] novel = c[c$numVariants>minBinCutoff & c$category=="novel",] dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] d = c[c$numVariants>minBinCutoff,] ymin = min(d$titv) @@ -38,7 +39,12 @@ abline(v=m75,lty=2) abline(v=m25,lty=2) points(novel$value,novel$titv,col="green",pch=20) points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} dev.off() # @@ -55,47 +61,63 @@ abline(v=m75,lty=2) abline(v=m25,lty=2) points(novel$value,novel$titv,col="green",pch=20) points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} dev.off() # -# Plot dbsnp rate as a function of the annotation +# Plot dbsnp and true positive rate as a function of the annotation # -outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="") +outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) -plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); -axis(1,axTicks(1), format(axTicks(1), scientific=F)) -m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) -ma = all[all$value > m,] -mb = all[all$value < m,] -m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) -m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) -abline(v=m,lty=2) -abline(v=m75,lty=2) -abline(v=m25,lty=2) -dev.off() - -# -# Plot dbsnp rate as a function of the annotation, log scale on the x-axis -# - -outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="") -pdf(outfile, height=7, width=7) -par(cex=1.1) -plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14); axis(1,axTicks(1), format(axTicks(1), scientific=F)) abline(v=m,lty=2) abline(v=m75,lty=2) abline(v=m25,lty=2) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(all$value,all$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.", annotationName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2) +abline(v=m75,lty=2) +abline(v=m25,lty=2) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(all$value,all$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} dev.off() # # Plot histogram of the annotation's value # -outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="") +outfile = paste(input, "annotationHistogram.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14); diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 61b73bc7b..ff526f267 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -99,7 +99,7 @@ public abstract class TraversalEngine { (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads)); logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates)); - logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary)); + logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary)); logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments)); logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java index 4863fe5a8..2527ad2a8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java @@ -84,13 +84,23 @@ public class AnalyzeAnnotationsWalker extends RodWalker { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - // Add each VCF record to each annotation's data structure if( tracker != null ) { + + // First find out if this variant is in the truth set + boolean isTrueVariant = false; for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { - if( rod != null && rod instanceof RodVCF ) { - RodVCF variant = (RodVCF) rod; + if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) { + isTrueVariant = true; + break; // at least one of the truth files has this variant, so break out + } + } + + // Add each annotation in this VCF Record to the dataManager + for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { + if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) { + final RodVCF variant = (RodVCF) rod; if( variant.isSNP() ) { - dataManager.addAnnotations( variant, SAMPLE_NAME ); + dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant ); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java index 4d23c3d2c..5db86f689 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -48,7 +48,7 @@ public class AnnotationDataManager { data = new HashMap>(); } - public void addAnnotations( final RodVCF variant, final String sampleName ) { + public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) { if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out @@ -80,7 +80,6 @@ public class AnnotationDataManager { } final boolean isNovelVariant = variant.getID().equals("."); - final boolean isTrueVariant = false; //BUGBUG: Check truth input file to see if this variant is in the truth set // Decide if the variant is a transition or transversion if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) { @@ -109,36 +108,37 @@ public class AnnotationDataManager { } // Output a header line - output.println("value\ttitv\tdbsnp\tnumVariants\tcategory"); + output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory"); // Bin SNPs and calculate truth metrics for each bin thisAnnotationBin.clearBin(); for( AnnotationDatum datum : data.get( annotationKey ) ) { thisAnnotationBin.combine( datum ); if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); thisAnnotationBin.clearBin(); } // else, continue accumulating variants because this bin isn't full yet } if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + - "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() + + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth"); thisAnnotationBin.clearBin(); - } // Close the PrintStream output.close(); // Print out the command line to make it clear to the user what is being executed and how one might modify it - final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + + final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " + OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN; System.out.println( rScriptCommandLine ); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java index 186e9e1a4..af8f66953 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java @@ -41,17 +41,18 @@ public class AnnotationDatum implements Comparator { private final int[] ti; private final int[] tv; - public static final int FULL_SET = -1; - public static final int NOVEL_SET = 0; - public static final int DBSNP_SET = 1; - public static final int TRUTH_SET = 2; + public static final int FULL_SET = 0; + public static final int NOVEL_SET = 1; + public static final int DBSNP_SET = 2; + public static final int TRUTH_SET = 3; + private static final int NUM_SETS = 4; public AnnotationDatum() { value = 0.0f; - ti = new int[3]; - tv = new int[3]; - for( int iii = 0; iii < 3; iii++ ) { + ti = new int[NUM_SETS]; + tv = new int[NUM_SETS]; + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; } @@ -60,9 +61,9 @@ public class AnnotationDatum implements Comparator { public AnnotationDatum( float _value ) { value = _value; - ti = new int[3]; - tv = new int[3]; - for( int iii = 0; iii < 3; iii++ ) { + ti = new int[NUM_SETS]; + tv = new int[NUM_SETS]; + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; } @@ -70,6 +71,7 @@ public class AnnotationDatum implements Comparator { final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) { + ti[FULL_SET]++; if( isNovelVariant ) { ti[NOVEL_SET]++; } else { // Is known, in DBsnp @@ -82,6 +84,7 @@ public class AnnotationDatum implements Comparator { final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) { + tv[FULL_SET]++; if( isNovelVariant ) { tv[NOVEL_SET]++; } else { // Is known, in DBsnp @@ -94,7 +97,7 @@ public class AnnotationDatum implements Comparator { final public void combine( final AnnotationDatum that ) { - for( int iii = 0; iii < 3; iii++ ) { + for( int iii = 0; iii < NUM_SETS; iii++ ) { this.ti[iii] += that.ti[iii]; this.tv[iii] += that.tv[iii]; } @@ -103,61 +106,44 @@ public class AnnotationDatum implements Comparator { final public float calcTiTv( final int INDEX ) { - if( INDEX == FULL_SET ) { - if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) { - throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); - } - - if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero - return 0.0f; - } - - return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) / - ((float) (tv[NOVEL_SET] + tv[DBSNP_SET])); - } else { - if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { - throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); - } - - if( tv[INDEX] == 0 ) { // Don't divide by zero - return 0.0f; - } - - return ((float) ti[INDEX]) / ((float) tv[INDEX]); + if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { + throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); } + + if( tv[INDEX] == 0 ) { // Don't divide by zero + return 0.0f; + } + + return ((float) ti[INDEX]) / ((float) tv[INDEX]); } final public float calcDBsnpRate() { - if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero + if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero return 0.0f; } return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) / - ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); + ((float) ti[FULL_SET] + tv[FULL_SET]); } final public float calcTPrate() { - if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero + if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero return 0.0f; } return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) / - ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); + ((float) ti[FULL_SET] + tv[FULL_SET]); } final public int numVariants( final int INDEX ) { - if( INDEX == FULL_SET ) { - return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]; - } else { - return ti[INDEX] + tv[INDEX]; - } + return ti[INDEX] + tv[INDEX]; } final public void clearBin() { value = 0.0f; - for( int iii = 0; iii < 3; iii++ ) { + for( int iii = 0; iii < NUM_SETS; iii++ ) { ti[iii] = 0; tv[iii] = 0; }