From a11503819a067583c520b949b7e65f48e0cca191 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Fri, 22 Jan 2010 19:00:23 +0000 Subject: [PATCH] AnalyzeAnnotations now breaks out its TiTv plots into novel SNPs, dbSNP sites, and combined. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2659 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_Annotations_BinnedTiTv.R | 26 +++- .../AnnotationDataManager.java | 121 ++++++++++++++++-- 2 files changed, 134 insertions(+), 13 deletions(-) diff --git a/R/plot_Annotations_BinnedTiTv.R b/R/plot_Annotations_BinnedTiTv.R index 01490b4bd..ee74e8c55 100644 --- a/R/plot_Annotations_BinnedTiTv.R +++ b/R/plot_Annotations_BinnedTiTv.R @@ -15,7 +15,13 @@ c <- read.table(input, header=T) # Plot TiTv ratio as a function of the annotation # -gt = c[c$numVariants>minBinCutoff,] +gt = c[c$numVariants>minBinCutoff & c$category==0,] +novel = c[c$numVariants>minBinCutoff & c$category==1,] +dbsnp = c[c$numVariants>minBinCutoff & c$category==2,] + +d = c[c$numVariants>minBinCutoff,] +cmin = min(d$titv) +cmax = max(d$titv) outfile = paste(outputDir, "binnedTiTv.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) @@ -31,6 +37,24 @@ abline(v=m75,lty=2) abline(v=m25,lty=2) dev.off() +outfile = paste(outputDir, "binnedTiTv_novel.", annotationName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(gt$value,gt$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20,yim=c(cmin,cmax)); +m = weighted.mean(gt$value,gt$numVariants/sum(gt$numVariants)) +ma = gt[gt$value > m,] +mb = gt[gt$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +abline(v=m,lty=2) +abline(v=m75,lty=2) +abline(v=m25,lty=2) +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +legend("topleft", c("overall","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +dev.off() + + outfile = paste(outputDir, "binnedTiTv_quartiles.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java index 2638adfb8..2ecff405b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -41,10 +41,16 @@ import java.io.FileNotFoundException; public class AnnotationDataManager { - public final HashMap> data; + public final HashMap> dataFull; + public final HashMap> dataNovel; + public final HashMap> dataDBsnp; + public final HashMap> dataTruthSet; public AnnotationDataManager() { - data = new HashMap>(); + dataFull = new HashMap>(); + dataNovel = new HashMap>(); + dataDBsnp = new HashMap>(); + dataTruthSet = new HashMap>(); } public void addAnnotations( RodVCF variant ) { @@ -57,10 +63,51 @@ public class AnnotationDataManager { } final float value = Float.parseFloat( infoField.get( annotationKey ) ); - TreeSet treeSet = data.get( annotationKey ); + if( variant.getID().equals(".") ) { // This is a novel variant + TreeSet treeSet = dataNovel.get( annotationKey ); + if( treeSet == null ) { // This annotation hasn't been seen before + treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation + dataNovel.put( annotationKey, treeSet ); + } + AnnotationDatum datum = new AnnotationDatum( value ); + if( treeSet.contains(datum) ) { + datum = treeSet.tailSet(datum).first(); + } else { + treeSet.add(datum); + } + + // Decide if it was a Ti or a Tv + if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { + datum.incrementTi(); + } else { + datum.incrementTv(); + } + } else { // This is a DBsnp variant + TreeSet treeSet = dataDBsnp.get( annotationKey ); + if( treeSet == null ) { // This annotation hasn't been seen before + treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation + dataDBsnp.put( annotationKey, treeSet ); + } + AnnotationDatum datum = new AnnotationDatum( value ); + if( treeSet.contains(datum) ) { + datum = treeSet.tailSet(datum).first(); + } else { + treeSet.add(datum); + } + + // Decide if it was a Ti or a Tv + if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { + datum.incrementTi(); + } else { + datum.incrementTv(); + } + } + + // the overall set, containing both novel and DBsnp variants + TreeSet treeSet = dataFull.get( annotationKey ); if( treeSet == null ) { // This annotation hasn't been seen before treeSet = new TreeSet( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation - data.put( annotationKey, treeSet ); + dataFull.put( annotationKey, treeSet ); } AnnotationDatum datum = new AnnotationDatum( value ); if( treeSet.contains(datum) ) { @@ -81,7 +128,7 @@ public class AnnotationDataManager { public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR, final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) { - for( String annotationKey: data.keySet() ) { + for( String annotationKey: dataFull.keySet() ) { /* PrintStream output; @@ -130,13 +177,13 @@ public class AnnotationDataManager { } // Output a header line - output.println("value\ttitv\tnumVariants\t"); + output.println("value\ttitv\tnumVariants\tcategory"); // Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv int numTi = 0; int numTv = 0; AnnotationDatum lastDatum = null; - for( AnnotationDatum datum : data.get( annotationKey ) ) { + for( AnnotationDatum datum : dataFull.get( annotationKey ) ) { numTi += datum.numTransitions; numTv += datum.numTransversions; lastDatum = datum; @@ -144,7 +191,7 @@ public class AnnotationDataManager { float titv; if( numTv == 0) { titv = 0.0f; } else { titv = ((float) numTi) / ((float) numTv); } - output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv)); + output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0"); numTi = 0; numTv = 0; } @@ -154,9 +201,59 @@ public class AnnotationDataManager { float titv; if( numTv == 0) { titv = 0.0f; } else { titv = ((float) numTi) / ((float) numTv); } - output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)); + output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0"); } + + numTi = 0; + numTv = 0; + lastDatum = null; + for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) { + numTi += datum.numTransitions; + numTv += datum.numTransversions; + lastDatum = datum; + if( numTi + numTv >= MAX_VARIANTS_PER_BIN/3 ) { // This annotation bin is full + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1"); + numTi = 0; + numTv = 0; + } + // else, continue accumulating variants because this bin isn't full yet + } + if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1"); + } + + numTi = 0; + numTv = 0; + lastDatum = null; + for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) { + numTi += datum.numTransitions; + numTv += datum.numTransversions; + lastDatum = datum; + if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2"); + numTi = 0; + numTv = 0; + } + // else, continue accumulating variants because this bin isn't full yet + } + if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2"); + } + + /* System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " + OUTPUT_DIR + annotationKey + ".cumulative.dat" + " " + OUTPUT_DIR + " " + annotationKey); @@ -169,9 +266,9 @@ public class AnnotationDataManager { } */ - //System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + - // OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey + - // " " + MIN_VARIANTS_PER_BIN); + System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + + OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey + + " " + MIN_VARIANTS_PER_BIN); // Execute the RScript command to plot the table of TiTv values try {