From b8ae083d1bb4cd10b9f5d89a3874b345c581277a Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 27 Jan 2010 21:08:33 +0000 Subject: [PATCH] AnalyzeAnnotations creates a plot of dbsnp rate as a function of the annotations. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2711 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_Annotations_BinnedTiTv.R | 55 ++++++++++++++++--- .../AnnotationDataManager.java | 23 ++++---- .../variantoptimizer/AnnotationDatum.java | 12 +++- 3 files changed, 71 insertions(+), 19 deletions(-) diff --git a/R/plot_Annotations_BinnedTiTv.R b/R/plot_Annotations_BinnedTiTv.R index 10173b27d..31d139559 100644 --- a/R/plot_Annotations_BinnedTiTv.R +++ b/R/plot_Annotations_BinnedTiTv.R @@ -7,13 +7,8 @@ input = args[1] annotationName = args[2] minBinCutoff = as.numeric(args[3]) - c <- read.table(input, header=T) -# -# Plot TiTv ratio as a function of the annotation -# - all = c[c$numVariants>minBinCutoff & c$category=="all",] novel = c[c$numVariants>minBinCutoff & c$category=="novel",] dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] @@ -24,10 +19,14 @@ ymax = max(d$titv) xmin = min(d$value) xmax = max(d$value) +# +# Plot TiTv ratio as a function of the annotation +# + outfile = paste(input, ".TiTv.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); axis(1,axTicks(1), format(axTicks(1), scientific=F)) m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) ma = all[all$value > m,] @@ -42,10 +41,14 @@ points(dbsnp$value,dbsnp$titv,col="blue",pch=20) legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) dev.off() +# +# Plot TiTv ratio as a function of the annotation, log scale on the x-axis +# + outfile = paste(input, ".TiTv_log.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) -plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); axis(1,axTicks(1), format(axTicks(1), scientific=F)) abline(v=m,lty=2) abline(v=m75,lty=2) @@ -55,10 +58,46 @@ points(dbsnp$value,dbsnp$titv,col="blue",pch=20) legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) dev.off() +# +# Plot dbsnp rate as a function of the annotation +# + +outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) +ma = all[all$value > m,] +mb = all[all$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +abline(v=m,lty=2) +abline(v=m75,lty=2) +abline(v=m25,lty=2) +dev.off() + +# +# Plot dbsnp rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2) +abline(v=m75,lty=2) +abline(v=m25,lty=2) +dev.off() + +# +# Plot histogram of the annotation's value +# outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) -plot(all$value,all$numVariants,xlab=annotationName,ylab="num variants in bin",type="h",xaxt="n",ps=14); +plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14); axis(1,axTicks(1), format(axTicks(1), scientific=F)) dev.off() \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java index 74a88d3c2..c28cb020b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -73,7 +73,7 @@ public class AnnotationDataManager { data.put( annotationKey, treeSet ); } AnnotationDatum datum = new AnnotationDatum( value ); - if( treeSet.contains(datum) ) { // contains uses AnnotationDatum's equals function, so it only checks if the value field is already present + if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present datum = treeSet.tailSet(datum).first(); } else { treeSet.add(datum); @@ -95,39 +95,42 @@ public class AnnotationDataManager { final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) { final AnnotationDatum thisAnnotationBin = new AnnotationDatum(); - System.out.println( "\nExecuting RScript commands:" ); + System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" ); // For each annotation we've seen for( String annotationKey : data.keySet() ) { PrintStream output; try { - output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation + output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the intermediate data file for this annotation } catch ( FileNotFoundException e ) { throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " + OUTPUT_PREFIX + annotationKey + ".dat"); } // Output a header line - output.println("value\ttitv\tnumVariants\tcategory"); + output.println("value\ttitv\tdbsnp\tnumVariants\tcategory"); // Bin SNPs and calculate truth metrics for each bin thisAnnotationBin.clearBin(); for( AnnotationDatum datum : data.get( annotationKey ) ) { thisAnnotationBin.combine( datum ); if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); thisAnnotationBin.clearBin(); } // else, continue accumulating variants because this bin isn't full yet } if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); - output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel"); + output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp"); + thisAnnotationBin.clearBin(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java index 33b0dd096..186e9e1a4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java @@ -127,13 +127,23 @@ public class AnnotationDatum implements Comparator { } } + final public float calcDBsnpRate() { + + if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero + return 0.0f; + } + + return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) / + ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); + } + final public float calcTPrate() { if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero return 0.0f; } - return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) / + return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) / ((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]); }