AnalyzeAnnotations now breaks out its TiTv plots into novel SNPs, dbSNP sites, and combined.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2659 348d0f76-0448-11de-a6fe-93d51630548a
2010-01-22 19:00:23 +00:00 · 2010-01-22 19:00:23 +00:00 · a11503819a
parent e00cb688ac
commit a11503819a
2 changed files with 134 additions and 13 deletions
--- a/R/plot_Annotations_BinnedTiTv.R
+++ b/R/plot_Annotations_BinnedTiTv.R
@ -15,7 +15,13 @@ c <- read.table(input, header=T)
 # Plot TiTv ratio as a function of the annotation
 #

-gt = c[c$numVariants>minBinCutoff,]
+gt = c[c$numVariants>minBinCutoff & c$category==0,]
+novel = c[c$numVariants>minBinCutoff & c$category==1,]
+dbsnp = c[c$numVariants>minBinCutoff & c$category==2,]
+
+d = c[c$numVariants>minBinCutoff,]
+cmin = min(d$titv)
+cmax = max(d$titv)

 outfile = paste(outputDir, "binnedTiTv.", annotationName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
@ -31,6 +37,24 @@ abline(v=m75,lty=2)
 abline(v=m25,lty=2)
 dev.off()

+outfile = paste(outputDir, "binnedTiTv_novel.", annotationName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(gt$value,gt$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20,yim=c(cmin,cmax));
+m = weighted.mean(gt$value,gt$numVariants/sum(gt$numVariants))
+ma = gt[gt$value > m,]
+mb = gt[gt$value < m,]
+m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
+m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
+abline(v=m,lty=2)
+abline(v=m75,lty=2)
+abline(v=m25,lty=2)
+points(novel$value,novel$titv,col="green",pch=20)
+points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
+legend("topleft", c("overall","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
+dev.off()
+
+
 outfile = paste(outputDir, "binnedTiTv_quartiles.", annotationName, ".pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
@ -41,10 +41,16 @@ import java.io.FileNotFoundException;

 public class AnnotationDataManager {

-    public final HashMap<String, TreeSet<AnnotationDatum>> data;
+    public final HashMap<String, TreeSet<AnnotationDatum>> dataFull;
+    public final HashMap<String, TreeSet<AnnotationDatum>> dataNovel;
+    public final HashMap<String, TreeSet<AnnotationDatum>> dataDBsnp;
+    public final HashMap<String, TreeSet<AnnotationDatum>> dataTruthSet;

    public AnnotationDataManager() {
-        data = new HashMap<String, TreeSet<AnnotationDatum>>();
+        dataFull = new HashMap<String, TreeSet<AnnotationDatum>>();
+        dataNovel = new HashMap<String, TreeSet<AnnotationDatum>>();
+        dataDBsnp = new HashMap<String, TreeSet<AnnotationDatum>>();
+        dataTruthSet = new HashMap<String, TreeSet<AnnotationDatum>>();
    }

    public void addAnnotations( RodVCF variant ) {
@ -57,10 +63,51 @@ public class AnnotationDataManager {
            }
            final float value = Float.parseFloat( infoField.get( annotationKey ) );

-            TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
+            if( variant.getID().equals(".") ) { // This is a novel variant
+                TreeSet<AnnotationDatum> treeSet = dataNovel.get( annotationKey );
+                if( treeSet == null ) { // This annotation hasn't been seen before
+                    treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
+                    dataNovel.put( annotationKey, treeSet );
+                }
+                AnnotationDatum datum = new AnnotationDatum( value );
+                if( treeSet.contains(datum) ) {
+                    datum = treeSet.tailSet(datum).first();
+                } else {
+                    treeSet.add(datum);
+                }
+
+                // Decide if it was a Ti or a Tv
+                if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
+                    datum.incrementTi();
+                } else {
+                    datum.incrementTv();
+                }
+            } else { // This is a DBsnp variant
+                TreeSet<AnnotationDatum> treeSet = dataDBsnp.get( annotationKey );
+                if( treeSet == null ) { // This annotation hasn't been seen before
+                    treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
+                    dataDBsnp.put( annotationKey, treeSet );
+                }
+                AnnotationDatum datum = new AnnotationDatum( value );
+                if( treeSet.contains(datum) ) {
+                    datum = treeSet.tailSet(datum).first();
+                } else {
+                    treeSet.add(datum);
+                }
+
+                // Decide if it was a Ti or a Tv
+                if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
+                    datum.incrementTi();
+                } else {
+                    datum.incrementTv();
+                }
+            }
+
+            // the overall set, containing both novel and DBsnp variants
+            TreeSet<AnnotationDatum> treeSet = dataFull.get( annotationKey );
            if( treeSet == null ) { // This annotation hasn't been seen before
                treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
-                data.put( annotationKey, treeSet );
+                dataFull.put( annotationKey, treeSet );
            }
            AnnotationDatum datum = new AnnotationDatum( value );
            if( treeSet.contains(datum) ) {
@ -81,7 +128,7 @@ public class AnnotationDataManager {
    public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR,
                                      final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {

-        for( String annotationKey: data.keySet() ) {
+        for( String annotationKey: dataFull.keySet() ) {

            /*
            PrintStream output;
@ -130,13 +177,13 @@ public class AnnotationDataManager {
            }

            // Output a header line
-            output.println("value\ttitv\tnumVariants\t");
+            output.println("value\ttitv\tnumVariants\tcategory");

            // Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv
            int numTi = 0;
            int numTv = 0;
            AnnotationDatum lastDatum = null;
-            for( AnnotationDatum datum : data.get( annotationKey ) ) {
+            for( AnnotationDatum datum : dataFull.get( annotationKey ) ) {
                numTi += datum.numTransitions;
                numTv += datum.numTransversions;
                lastDatum = datum;
@ -144,7 +191,7 @@ public class AnnotationDataManager {
                    float titv;
                    if( numTv == 0) { titv = 0.0f; }
                    else { titv = ((float) numTi) / ((float) numTv); }
-                    output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv));
+                    output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0");
                    numTi = 0;
                    numTv = 0;
                }
@ -154,9 +201,59 @@ public class AnnotationDataManager {
                float titv;
                if( numTv == 0) { titv = 0.0f; }
                else { titv = ((float) numTi) / ((float) numTv); }
-                output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv));
+                output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0");
            }

+
+            numTi = 0;
+            numTv = 0;
+            lastDatum = null;
+            for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) {
+                numTi += datum.numTransitions;
+                numTv += datum.numTransversions;
+                lastDatum = datum;
+                if( numTi + numTv >= MAX_VARIANTS_PER_BIN/3 ) { // This annotation bin is full
+                    float titv;
+                    if( numTv == 0) { titv = 0.0f; }
+                    else { titv = ((float) numTi) / ((float) numTv); }
+                    output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1");
+                    numTi = 0;
+                    numTv = 0;
+                }
+                // else, continue accumulating variants because this bin isn't full yet
+            }
+            if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
+                float titv;
+                if( numTv == 0) { titv = 0.0f; }
+                else { titv = ((float) numTi) / ((float) numTv); }
+                output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1");
+            }
+
+            numTi = 0;
+            numTv = 0;
+            lastDatum = null;
+            for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) {
+                numTi += datum.numTransitions;
+                numTv += datum.numTransversions;
+                lastDatum = datum;
+                if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
+                    float titv;
+                    if( numTv == 0) { titv = 0.0f; }
+                    else { titv = ((float) numTi) / ((float) numTv); }
+                    output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2");
+                    numTi = 0;
+                    numTv = 0;
+                }
+                // else, continue accumulating variants because this bin isn't full yet
+            }
+            if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
+                float titv;
+                if( numTv == 0) { titv = 0.0f; }
+                else { titv = ((float) numTi) / ((float) numTv); }
+                output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2");
+            }
+
+
            /*
            System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
                                        OUTPUT_DIR + annotationKey + ".cumulative.dat" + " " + OUTPUT_DIR + " " + annotationKey);
@ -169,9 +266,9 @@ public class AnnotationDataManager {
            }
            */

-            //System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
-            //                            OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey +
-            //                            " " + MIN_VARIANTS_PER_BIN);
+            System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
+                                        OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey +
+                                        " " + MIN_VARIANTS_PER_BIN);

            // Execute the RScript command to plot the table of TiTv values
            try {