AnalyzeAnnotations now plots true positive rate instead of percentage of variants found in the truth set. Committing GCContentCovariate to help people experiment with correcting the pilot3/Kristian base calling error mode in slx.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2740 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ac2a207b0b
commit
4f29a1d4f6
|
|
@ -73,20 +73,26 @@ dev.off()
|
|||
# Plot dbsnp and true positive rate as a function of the annotation
|
||||
#
|
||||
|
||||
ymin = min(all$dbsnp)
|
||||
ymax = max(all$dbsnp)
|
||||
outfile = paste(input, ".truthRate.", annotationName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
yLabel = "DBsnp Rate"
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
yLabel = "DBsnp/Truth Rate"
|
||||
t = all[all$truePositive>0,]
|
||||
yLabel = "DBsnp/True Positive Rate"
|
||||
ymin = min(min(all$dbsnp),min(t$truePositive))
|
||||
ymax = max(max(all$dbsnp),max(t$truePositive))
|
||||
|
||||
}
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,xaxt="n",ps=14);
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2)
|
||||
abline(v=m75,lty=2)
|
||||
abline(v=m25,lty=2)
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(all$value,all$truePositive,col="magenta",pch=20);
|
||||
points(t$value,t$truePositive,col="magenta",pch=20);
|
||||
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
|
@ -102,13 +108,13 @@ yLabel = "DBsnp Rate"
|
|||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
yLabel = "DBsnp/Truth Rate"
|
||||
}
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,pch=20,xaxt="n",ps=14);
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2)
|
||||
abline(v=m75,lty=2)
|
||||
abline(v=m25,lty=2)
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(all$value,all$truePositive,col="magenta",pch=20);
|
||||
points(t$value,t$truePositive,col="magenta",pch=20);
|
||||
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,84 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 29, 2010
|
||||
*
|
||||
* The number of previous N bases (along the direction of the read) that contain G's and C's.
|
||||
* Only valid for Illumina reads. Otherwise return -1.
|
||||
*/
|
||||
|
||||
public class GCContentCovariate implements ExperimentalCovariate {
|
||||
|
||||
int numBack = 7;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
numBack = RAC.HOMOPOLYMER_NBACK;
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
|
||||
// ATTGCCCCGTAAAAAAAGAGAA
|
||||
// 0000123456654321001122
|
||||
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) ) {
|
||||
int numGC = 0;
|
||||
int startPos = 0;
|
||||
int stopPos = 0;
|
||||
final byte[] bases = read.getReadBases();
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
startPos = Math.max(offset - numBack, 0);
|
||||
stopPos = Math.max(offset - 1, 0);
|
||||
} else { // Negative direction
|
||||
startPos = Math.min(offset + 2, bases.length);
|
||||
stopPos = Math.min(offset + numBack + 1, bases.length);
|
||||
}
|
||||
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) {
|
||||
if( bases[iii] == (byte)'G' || bases[iii] == (byte)'C' ) {
|
||||
numGC++;
|
||||
}
|
||||
}
|
||||
|
||||
return numGC;
|
||||
} else { // This effect is specific to the Illumina platform
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,12 +1,15 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RodGLF;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -87,11 +90,27 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
if( tracker != null ) {
|
||||
|
||||
// First find out if this variant is in the truth set
|
||||
boolean isInTruthSet = false;
|
||||
boolean isTrueVariant = false;
|
||||
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
|
||||
if( rod != null && rod.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||
isTrueVariant = true;
|
||||
break; // at least one of the truth files has this variant, so break out
|
||||
isInTruthSet = true;
|
||||
|
||||
if( rod instanceof RodVCF ) {
|
||||
if( ((RodVCF) rod).isSNP() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
} else if( rod instanceof RodGLF ) {
|
||||
if( ((RodGLF) rod).isSNP() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
} else if( rod instanceof VariantBackedByGenotype ) {
|
||||
if( ((VariantBackedByGenotype) rod).getCalledGenotype().isVariant(ref.getBase()) ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
} else {
|
||||
throw new StingException( "Truth ROD is of unknown ROD type: " + rod.getName() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -100,7 +119,7 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
if( rod != null && rod instanceof RodVCF && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||
final RodVCF variant = (RodVCF) rod;
|
||||
if( variant.isSNP() ) {
|
||||
dataManager.addAnnotations( variant, SAMPLE_NAME, isTrueVariant );
|
||||
dataManager.addAnnotations( variant, SAMPLE_NAME, isInTruthSet, isTrueVariant );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ public class AnnotationDataManager {
|
|||
data = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
}
|
||||
|
||||
public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isTrueVariant ) {
|
||||
public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
|
||||
if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
|
||||
|
|
@ -83,9 +83,9 @@ public class AnnotationDataManager {
|
|||
|
||||
// Decide if the variant is a transition or transversion
|
||||
if( BaseUtils.isTransition( (byte)variant.getReferenceForSNP(), (byte)variant.getAlternativeBaseForSNP()) ) {
|
||||
datum.incrementTi( isNovelVariant, isTrueVariant );
|
||||
datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant );
|
||||
} else {
|
||||
datum.incrementTv( isNovelVariant, isTrueVariant );
|
||||
datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,7 +45,8 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
|||
public static final int NOVEL_SET = 1;
|
||||
public static final int DBSNP_SET = 2;
|
||||
public static final int TRUTH_SET = 3;
|
||||
private static final int NUM_SETS = 4;
|
||||
public static final int TRUE_POSITIVE = 4;
|
||||
private static final int NUM_SETS = 5;
|
||||
|
||||
public AnnotationDatum() {
|
||||
|
||||
|
|
@ -69,7 +70,7 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
|
||||
final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
ti[FULL_SET]++;
|
||||
if( isNovelVariant ) {
|
||||
|
|
@ -77,12 +78,15 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
|||
} else { // Is known, in DBsnp
|
||||
ti[DBSNP_SET]++;
|
||||
}
|
||||
if( isTrueVariant ) {
|
||||
if( isInTruthSet ) {
|
||||
ti[TRUTH_SET]++;
|
||||
if( isTrueVariant ) {
|
||||
ti[TRUE_POSITIVE]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
|
||||
final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
tv[FULL_SET]++;
|
||||
if( isNovelVariant ) {
|
||||
|
|
@ -90,8 +94,11 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
|||
} else { // Is known, in DBsnp
|
||||
tv[DBSNP_SET]++;
|
||||
}
|
||||
if( isTrueVariant ) {
|
||||
if( isInTruthSet ) {
|
||||
tv[TRUTH_SET]++;
|
||||
if( isTrueVariant ) {
|
||||
tv[TRUE_POSITIVE]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -129,12 +136,12 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
|||
|
||||
final public float calcTPrate() {
|
||||
|
||||
if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
|
||||
if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
|
||||
((float) ti[FULL_SET] + tv[FULL_SET]);
|
||||
return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) /
|
||||
((float) ti[TRUTH_SET] + tv[TRUTH_SET]);
|
||||
}
|
||||
|
||||
final public int numVariants( final int INDEX ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue