From fc4285f9fda02849c6822d6caa1235d782ba6ee9 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 27 Jan 2010 19:30:31 +0000 Subject: [PATCH] AnalyzeAnnotations seems to be popular so I've rewritten the guts to be easier to extend and maintain. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2707 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_Annotations_BinnedTiTv.R | 6 +- .../gatk/iterators/LocusOverflowTracker.java | 4 +- .../AnnotationDataManager.java | 159 ++++-------------- .../variantoptimizer/AnnotationDatum.java | 129 +++++++++++--- 4 files changed, 140 insertions(+), 158 deletions(-) diff --git a/R/plot_Annotations_BinnedTiTv.R b/R/plot_Annotations_BinnedTiTv.R index 99596aa5e..10173b27d 100644 --- a/R/plot_Annotations_BinnedTiTv.R +++ b/R/plot_Annotations_BinnedTiTv.R @@ -14,9 +14,9 @@ c <- read.table(input, header=T) # Plot TiTv ratio as a function of the annotation # -all = c[c$numVariants>minBinCutoff & c$category==0,] -novel = c[c$numVariants>minBinCutoff & c$category==1,] -dbsnp = c[c$numVariants>minBinCutoff & c$category==2,] +all = c[c$numVariants>minBinCutoff & c$category=="all",] +novel = c[c$numVariants>minBinCutoff & c$category=="novel",] +dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] d = c[c$numVariants>minBinCutoff,] ymin = min(d$titv) diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusOverflowTracker.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusOverflowTracker.java index d41e66751..08d2cb2cb 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusOverflowTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusOverflowTracker.java @@ -93,10 +93,10 @@ public class LocusOverflowTracker { // is the last warning they'll see if (warningsEmitted < MAX_WARNINGS) { warningsEmitted++; - Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation); + Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation); } else if (warningsEmitted == MAX_WARNINGS) { warningsEmitted++; - Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation + + Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation + "; the maximum warning count has been reached, we will no longer emit warnings of this nature!!"); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java index d022b66e8..5d467669e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -41,22 +41,16 @@ import java.io.FileNotFoundException; public class AnnotationDataManager { - public final HashMap> dataFull; - public final HashMap> dataNovel; - public final HashMap> dataDBsnp; - public final HashMap> dataTruthSet; + public final HashMap> data; public AnnotationDataManager() { - dataFull = new HashMap>(); - dataNovel = new HashMap>(); - dataDBsnp = new HashMap>(); - dataTruthSet = new HashMap>(); + data = new HashMap>(); } public void addAnnotations( RodVCF variant, String sampleName ) { - if( sampleName != null ) { // only process variants that are found in the sample with this sampleName - if( variant.getGenotype(sampleName).isNoCall() ) { // this variant isn't found in this sample so break out + if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName + if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out return; } } // else, process all samples @@ -69,58 +63,17 @@ public class AnnotationDataManager { boolean skipThisAnnotation = false; try { value = Float.parseFloat( infoField.get( annotationKey ) ); - } catch( Exception e ) { // BUGBUG: Make this exception more specific. NumberFormatException?? - skipThisAnnotation = true; // skip over annotations that aren't floats, like "AC"?? and "DB" + } catch( NumberFormatException e ) { + skipThisAnnotation = true; // Skip over annotations that aren't floats, like older versions of "AC" and "DB" } if( skipThisAnnotation ) { - continue; // skip over annotations that aren't floats, like "AC"?? and "DB" + continue; // Skip over annotations that aren't floats, like older versions of "AC" and "DB" } - if( variant.getID().equals(".") ) { // This is a novel variant - TreeSet treeSet = dataNovel.get( annotationKey ); - if( treeSet == null ) { // This annotation hasn't been seen before - treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation - dataNovel.put( annotationKey, treeSet ); - } - AnnotationDatum datum = new AnnotationDatum( value ); - if( treeSet.contains(datum) ) { - datum = treeSet.tailSet(datum).first(); - } else { - treeSet.add(datum); - } - - // Decide if it was a Ti or a Tv - if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { - datum.incrementTi(); - } else { - datum.incrementTv(); - } - } else { // This is a DBsnp variant - TreeSet treeSet = dataDBsnp.get( annotationKey ); - if( treeSet == null ) { // This annotation hasn't been seen before - treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation - dataDBsnp.put( annotationKey, treeSet ); - } - AnnotationDatum datum = new AnnotationDatum( value ); - if( treeSet.contains(datum) ) { - datum = treeSet.tailSet(datum).first(); - } else { - treeSet.add(datum); - } - - // Decide if it was a Ti or a Tv - if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { - datum.incrementTi(); - } else { - datum.incrementTv(); - } - } - - // the overall set, containing both novel and DBsnp variants - TreeSet treeSet = dataFull.get( annotationKey ); + TreeSet treeSet = data.get( annotationKey ); if( treeSet == null ) { // This annotation hasn't been seen before treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation - dataFull.put( annotationKey, treeSet ); + data.put( annotationKey, treeSet ); } AnnotationDatum datum = new AnnotationDatum( value ); if( treeSet.contains(datum) ) { @@ -129,11 +82,14 @@ public class AnnotationDataManager { treeSet.add(datum); } + final boolean isNovelVariant = variant.getID().equals("."); + final boolean isTrueVariant = false; + // Decide if it was a Ti or a Tv if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { - datum.incrementTi(); + datum.incrementTi( isNovelVariant, isTrueVariant ); } else { - datum.incrementTv(); + datum.incrementTv( isNovelVariant, isTrueVariant ); } } } @@ -143,12 +99,13 @@ public class AnnotationDataManager { System.out.println( "\nExecuting RScript commands:" ); - for( String annotationKey: dataFull.keySet() ) { + // For each annotation we've seen + for( String annotationKey : data.keySet() ) { PrintStream output; try { - output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); - } catch (FileNotFoundException e) { + output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation + } catch ( FileNotFoundException e ) { throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " + OUTPUT_PREFIX + annotationKey + ".dat"); } @@ -157,77 +114,23 @@ public class AnnotationDataManager { output.println("value\ttitv\tnumVariants\tcategory"); // Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv - int numTi = 0; - int numTv = 0; - AnnotationDatum lastDatum = null; - for( AnnotationDatum datum : dataFull.get( annotationKey ) ) { - numTi += datum.numTransitions; - numTv += datum.numTransversions; - lastDatum = datum; - if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0"); - numTi = 0; - numTv = 0; + AnnotationDatum thisAnnotationBin = new AnnotationDatum(); + for( AnnotationDatum datum : data.get( annotationKey ) ) { + thisAnnotationBin.combine( datum ); + if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + thisAnnotationBin.clearBin(); } // else, continue accumulating variants because this bin isn't full yet } - if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0"); - } + if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // one final bin that may not have been dumped out + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); - numTi = 0; - numTv = 0; - lastDatum = null; - for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) { - numTi += datum.numTransitions; - numTv += datum.numTransversions; - lastDatum = datum; - if( numTi + numTv >= MAX_VARIANTS_PER_BIN/2 ) { // This annotation bin is full - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1"); - numTi = 0; - numTv = 0; - } - // else, continue accumulating variants because this bin isn't full yet - } - if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1"); - } - - numTi = 0; - numTv = 0; - lastDatum = null; - for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) { - numTi += datum.numTransitions; - numTv += datum.numTransversions; - lastDatum = datum; - if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2"); - numTi = 0; - numTv = 0; - } - // else, continue accumulating variants because this bin isn't full yet - } - if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out - float titv; - if( numTv == 0) { titv = 0.0f; } - else { titv = ((float) numTi) / ((float) numTv); } - output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2"); } output.close(); @@ -239,7 +142,7 @@ public class AnnotationDataManager { try { final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN); - } catch (Exception e) { + } catch ( Exception e ) { throw new StingException( "Unable to execute RScript command" ); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java index 7512938eb..8dfdde197 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java @@ -37,50 +37,129 @@ import java.util.Comparator; public class AnnotationDatum implements Comparator { - public final float value; - public int numTransitions; - public int numTransversions; + public float value; + private final int[] ti; + private final int[] tv; + + public static final int FULL_SET = -1; + public static final int NOVEL_SET = 0; + public static final int DBSNP_SET = 1; + public static final int TRUTH_SET = 2; public AnnotationDatum() { + value = 0.0f; - numTransitions = 0; - numTransversions = 0; + ti = new int[3]; + tv = new int[3]; + for( int iii = 0; iii < 3; iii++ ) { + ti[iii] = 0; + tv[iii] = 0; + } } public AnnotationDatum( float _value ) { + value = _value; - numTransitions = 0; - numTransversions = 0; - } - - final public void incrementTi() { - numTransitions++; - } - - final public void incrementTv() { - numTransversions++; - } - - final public float calcTiTv() { - - if( numTransitions < 0 || numTransversions < 0) { - throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); + ti = new int[3]; + tv = new int[3]; + for( int iii = 0; iii < 3; iii++ ) { + ti[iii] = 0; + tv[iii] = 0; } + } - if( numTransversions == 0) { + final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) { + + if( isNovelVariant ) { + ti[NOVEL_SET]++; + } else { // Is known, in DBsnp + ti[DBSNP_SET]++; + } + if( isTrueVariant ) { + ti[TRUTH_SET]++; + } + } + + final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) { + + if( isNovelVariant ) { + tv[NOVEL_SET]++; + } else { // Is known, in DBsnp + tv[DBSNP_SET]++; + } + if( isTrueVariant ) { + tv[TRUTH_SET]++; + } + } + + final public void combine( final AnnotationDatum that ) { + + for( int iii = 0; iii < 3; iii++ ) { + this.ti[iii] += that.ti[iii]; + this.tv[iii] += that.tv[iii]; + } + this.value = that.value; // overwrite this bin's value + } + + final public float calcTiTv( final int INDEX ) { + + if( INDEX == FULL_SET ) { + if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) { + throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); + } + + if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero + return 0.0f; + } + + return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) / + ((float) (tv[NOVEL_SET] + tv[DBSNP_SET])); + } else { + if( ti[INDEX] < 0 || tv[INDEX] < 0 ) { + throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); + } + + if( tv[INDEX] == 0 ) { // Don't divide by zero + return 0.0f; + } + + return ((float) ti[INDEX]) / ((float) tv[INDEX]); + } + } + + final public float calcTPrate() { + + if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero return 0.0f; } - return ((float) numTransitions) / ((float) numTransversions); + return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) / + ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); } - public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { + final public int numVariants( final int INDEX ) { + if( INDEX == FULL_SET ) { + return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]; + } else { + return ti[INDEX] + tv[INDEX]; + } + } + + final public void clearBin() { + value = 0.0f; + for( int iii = 0; iii < 3; iii++ ) { + ti[iii] = 0; + tv[iii] = 0; + } + } + + public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator if( a1.value < a2.value ) { return -1; } if( a1.value > a2.value ) { return 1; } return 0; } - public int equals( AnnotationDatum that ) { + public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet if( this.value < that.value ) { return -1; } if( this.value > that.value ) { return 1; } return 0;