AnalyzeAnnotations now breaks out titv by calls in hapmap and also plots true positive rates. Any RODs passed in whose name starts with 'truth' is considered to be the truth set.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2726 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-28 21:41:23 +00:00
parent 7a10c40fb3
commit 79c4cc1db7
5 changed files with 100 additions and 82 deletions

View File

@ -12,6 +12,7 @@ c <- read.table(input, header=T)
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
d = c[c$numVariants>minBinCutoff,]
ymin = min(d$titv)
@ -38,7 +39,12 @@ abline(v=m75,lty=2)
abline(v=m25,lty=2)
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
@ -55,47 +61,63 @@ abline(v=m75,lty=2)
abline(v=m25,lty=2)
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot dbsnp rate as a function of the annotation
# Plot dbsnp and true positive rate as a function of the annotation
#
outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="")
outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
abline(v=m,lty=2)
abline(v=m75,lty=2)
abline(v=m25,lty=2)
dev.off()
#
# Plot dbsnp rate as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2)
abline(v=m75,lty=2)
abline(v=m25,lty=2)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(all$value,all$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".truthRate_log.", annotationName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2)
abline(v=m75,lty=2)
abline(v=m25,lty=2)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(all$value,all$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot histogram of the annotation's value
#
outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="")
outfile = paste(input, "annotationHistogram.", annotationName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14);

View File

@ -99,7 +99,7 @@ public abstract class TraversalEngine {
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary));
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
}

View File

@ -84,13 +84,23 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// Add each VCF record to each annotation's data structure
if( tracker != null ) {
// First find out if this variant is in the truth set
boolean isTrueVariant = false;
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod instanceof RodVCF ) {
RodVCF variant = (RodVCF) rod;
if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) {
isTrueVariant = true;
break; // at least one of the truth files has this variant, so break out
}
}
// Add each annotation in this VCF Record to the dataManager
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
final RodVCF variant = (RodVCF) rod;
if( variant.isSNP() ) {
dataManager.addAnnotations( variant, SAMPLE_NAME );
dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant );
}
}
}

View File

@ -48,7 +48,7 @@ public class AnnotationDataManager {
data = new HashMap<String, TreeSet<AnnotationDatum>>();
}
public void addAnnotations( final RodVCF variant, final String sampleName ) {
public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) {
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
@ -80,7 +80,6 @@ public class AnnotationDataManager {
}
final boolean isNovelVariant = variant.getID().equals(".");
final boolean isTrueVariant = false; //BUGBUG: Check truth input file to see if this variant is in the truth set
// Decide if the variant is a transition or transversion
if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) {
@ -109,36 +108,37 @@ public class AnnotationDataManager {
}
// Output a header line
output.println("value\ttitv\tdbsnp\tnumVariants\tcategory");
output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory");
// Bin SNPs and calculate truth metrics for each bin
thisAnnotationBin.clearBin();
for( AnnotationDatum datum : data.get( annotationKey ) ) {
thisAnnotationBin.combine( datum );
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() +
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
thisAnnotationBin.clearBin();
}
// else, continue accumulating variants because this bin isn't full yet
}
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() +
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
thisAnnotationBin.clearBin();
}
// Close the PrintStream
output.close();
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN;
System.out.println( rScriptCommandLine );

View File

@ -41,17 +41,18 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
private final int[] ti;
private final int[] tv;
public static final int FULL_SET = -1;
public static final int NOVEL_SET = 0;
public static final int DBSNP_SET = 1;
public static final int TRUTH_SET = 2;
public static final int FULL_SET = 0;
public static final int NOVEL_SET = 1;
public static final int DBSNP_SET = 2;
public static final int TRUTH_SET = 3;
private static final int NUM_SETS = 4;
public AnnotationDatum() {
value = 0.0f;
ti = new int[3];
tv = new int[3];
for( int iii = 0; iii < 3; iii++ ) {
ti = new int[NUM_SETS];
tv = new int[NUM_SETS];
for( int iii = 0; iii < NUM_SETS; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
@ -60,9 +61,9 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
public AnnotationDatum( float _value ) {
value = _value;
ti = new int[3];
tv = new int[3];
for( int iii = 0; iii < 3; iii++ ) {
ti = new int[NUM_SETS];
tv = new int[NUM_SETS];
for( int iii = 0; iii < NUM_SETS; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
@ -70,6 +71,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
ti[FULL_SET]++;
if( isNovelVariant ) {
ti[NOVEL_SET]++;
} else { // Is known, in DBsnp
@ -82,6 +84,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
tv[FULL_SET]++;
if( isNovelVariant ) {
tv[NOVEL_SET]++;
} else { // Is known, in DBsnp
@ -94,7 +97,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
final public void combine( final AnnotationDatum that ) {
for( int iii = 0; iii < 3; iii++ ) {
for( int iii = 0; iii < NUM_SETS; iii++ ) {
this.ti[iii] += that.ti[iii];
this.tv[iii] += that.tv[iii];
}
@ -103,61 +106,44 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
final public float calcTiTv( final int INDEX ) {
if( INDEX == FULL_SET ) {
if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) /
((float) (tv[NOVEL_SET] + tv[DBSNP_SET]));
} else {
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( tv[INDEX] == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( tv[INDEX] == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
}
final public float calcDBsnpRate() {
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
return 0.0f;
}
return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
((float) ti[FULL_SET] + tv[FULL_SET]);
}
final public float calcTPrate() {
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
return 0.0f;
}
return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
((float) ti[FULL_SET] + tv[FULL_SET]);
}
final public int numVariants( final int INDEX ) {
if( INDEX == FULL_SET ) {
return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET];
} else {
return ti[INDEX] + tv[INDEX];
}
return ti[INDEX] + tv[INDEX];
}
final public void clearBin() {
value = 0.0f;
for( int iii = 0; iii < 3; iii++ ) {
for( int iii = 0; iii < NUM_SETS; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}