AnalyzeAnnotations now plots true positive rate instead of percentage of variants found in the truth set. Committing GCContentCovariate to help people experiment with correcting the pilot3/Kristian base calling error mode in slx.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2740 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-29 20:01:56 +00:00
parent ac2a207b0b
commit 4f29a1d4f6
5 changed files with 135 additions and 19 deletions

View File

@ -73,20 +73,26 @@ dev.off()
# Plot dbsnp and true positive rate as a function of the annotation
#
ymin = min(all$dbsnp)
ymax = max(all$dbsnp)
outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
t = all[all$truePositive>0,]
yLabel = "DBsnp/True Positive Rate"
ymin = min(min(all$dbsnp),min(t$truePositive))
ymax = max(max(all$dbsnp),max(t$truePositive))
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14);
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2)
abline(v=m75,lty=2)
abline(v=m25,lty=2)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(all$value,all$truePositive,col="magenta",pch=20);
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
@ -102,13 +108,13 @@ yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14);
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2)
abline(v=m75,lty=2)
abline(v=m25,lty=2)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(all$value,all$truePositive,col="magenta",pch=20);
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()

View File

@ -0,0 +1,84 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 29, 2010
*
* The number of previous N bases (along the direction of the read) that contain G's and C's.
* Only valid for Illumina reads. Otherwise return -1.
*/
public class GCContentCovariate implements ExperimentalCovariate {
int numBack = 7;
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {
numBack = RAC.HOMOPOLYMER_NBACK;
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final SAMRecord read, final int offset ) {
// ATTGCCCCGTAAAAAAAGAGAA
// 0000123456654321001122
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) ) {
int numGC = 0;
int startPos = 0;
int stopPos = 0;
final byte[] bases = read.getReadBases();
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
startPos = Math.max(offset - numBack, 0);
stopPos = Math.max(offset - 1, 0);
} else { // Negative direction
startPos = Math.min(offset + 2, bases.length);
stopPos = Math.min(offset + numBack + 1, bases.length);
}
for( int iii = startPos; iii < stopPos; iii++ ) {
if( bases[iii] == (byte)'G' || bases[iii] == (byte)'C' ) {
numGC++;
}
}
return numGC;
} else { // This effect is specific to the Illumina platform
return -1;
}
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) {
return Integer.parseInt( str );
}
}

View File

@ -1,12 +1,15 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.refdata.RodGLF;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
/*
* Copyright (c) 2010 The Broad Institute
@ -87,11 +90,27 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
if( tracker != null ) {
// First find out if this variant is in the truth set
boolean isInTruthSet = false;
boolean isTrueVariant = false;
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) {
isTrueVariant = true;
break; // at least one of the truth files has this variant, so break out
isInTruthSet = true;
if( rod instanceof RodVCF ) {
if( ((RodVCF) rod).isSNP() ) {
isTrueVariant = true;
}
} else if( rod instanceof RodGLF ) {
if( ((RodGLF) rod).isSNP() ) {
isTrueVariant = true;
}
} else if( rod instanceof VariantBackedByGenotype ) {
if( ((VariantBackedByGenotype) rod).getCalledGenotype().isVariant(ref.getBase()) ) {
isTrueVariant = true;
}
} else {
throw new StingException( "Truth ROD is of unknown ROD type: " + rod.getName() );
}
}
}
@ -100,7 +119,7 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
final RodVCF variant = (RodVCF) rod;
if( variant.isSNP() ) {
dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant );
dataManager.addAnnotations( variant, SAMPLE_NAME, isInTruthSet, isTrueVariant );
}
}
}

View File

@ -48,7 +48,7 @@ public class AnnotationDataManager {
data = new HashMap<String, TreeSet<AnnotationDatum>>();
}
public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) {
public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
@ -83,9 +83,9 @@ public class AnnotationDataManager {
// Decide if the variant is a transition or transversion
if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi( isNovelVariant, isTrueVariant );
datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant );
} else {
datum.incrementTv( isNovelVariant, isTrueVariant );
datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant );
}
}
}

View File

@ -45,7 +45,8 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
public static final int NOVEL_SET = 1;
public static final int DBSNP_SET = 2;
public static final int TRUTH_SET = 3;
private static final int NUM_SETS = 4;
public static final int TRUE_POSITIVE = 4;
private static final int NUM_SETS = 5;
public AnnotationDatum() {
@ -69,7 +70,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
}
}
final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
ti[FULL_SET]++;
if( isNovelVariant ) {
@ -77,12 +78,15 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
} else { // Is known, in DBsnp
ti[DBSNP_SET]++;
}
if( isTrueVariant ) {
if( isInTruthSet ) {
ti[TRUTH_SET]++;
if( isTrueVariant ) {
ti[TRUE_POSITIVE]++;
}
}
}
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
tv[FULL_SET]++;
if( isNovelVariant ) {
@ -90,8 +94,11 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
} else { // Is known, in DBsnp
tv[DBSNP_SET]++;
}
if( isTrueVariant ) {
if( isInTruthSet ) {
tv[TRUTH_SET]++;
if( isTrueVariant ) {
tv[TRUE_POSITIVE]++;
}
}
}
@ -129,12 +136,12 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
final public float calcTPrate() {
if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero
return 0.0f;
}
return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
((float) ti[FULL_SET] + tv[FULL_SET]);
return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) /
((float) ti[TRUTH_SET] + tv[TRUTH_SET]);
}
final public int numVariants( final int INDEX ) {