Experimental support for sided annotations. Currently not more/less valuable than two-tailed testing. Future experiments are needed

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4864 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-12-17 15:08:31 +00:00
parent 21dc05138a
commit 32d5397c01
4 changed files with 113 additions and 14 deletions

View File

@ -28,8 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Arrays;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -38,6 +40,11 @@ import java.io.PrintStream;
*/ */
public class VariantDataManager { public class VariantDataManager {
public enum TailType {
TWO_TAILED,
SMALL_IS_GOOD,
BIG_IS_GOOD
}
protected final static Logger logger = Logger.getLogger(VariantDataManager.class); protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
@ -46,6 +53,7 @@ public class VariantDataManager {
public final int numAnnotations; public final int numAnnotations;
public final double[] meanVector; public final double[] meanVector;
public final double[] varianceVector; // This is really the standard deviation public final double[] varianceVector; // This is really the standard deviation
public final TailType[] tailTypes;
public boolean isNormalized; public boolean isNormalized;
public final ExpandingArrayList<String> annotationKeys; public final ExpandingArrayList<String> annotationKeys;
@ -59,6 +67,7 @@ public class VariantDataManager {
numAnnotations = 0; numAnnotations = 0;
meanVector = null; meanVector = null;
varianceVector = null; varianceVector = null;
tailTypes = null;
} else { } else {
numAnnotations = _annotationKeys.size(); numAnnotations = _annotationKeys.size();
if( numAnnotations <= 0 ) { if( numAnnotations <= 0 ) {
@ -66,7 +75,9 @@ public class VariantDataManager {
} }
meanVector = new double[numAnnotations]; meanVector = new double[numAnnotations];
varianceVector = new double[numAnnotations]; varianceVector = new double[numAnnotations];
isNormalized = false; tailTypes = new TailType[numAnnotations];
Arrays.fill(tailTypes, TailType.TWO_TAILED);
isNormalized = false;
} }
annotationKeys = _annotationKeys; annotationKeys = _annotationKeys;
} }
@ -79,6 +90,8 @@ public class VariantDataManager {
varianceVector = new double[numAnnotations]; varianceVector = new double[numAnnotations];
isNormalized = true; isNormalized = true;
annotationKeys = new ExpandingArrayList<String>(); annotationKeys = new ExpandingArrayList<String>();
tailTypes = new TailType[numAnnotations];
Arrays.fill(tailTypes, TailType.TWO_TAILED);
int jjj = 0; int jjj = 0;
for( final String line : annotationLines ) { for( final String line : annotationLines ) {
@ -90,6 +103,22 @@ public class VariantDataManager {
} }
} }
//
// code for dealing with TailTypes
//
public void setAnnotationType(String annotation, TailType t ) {
int i = annotationKeys.indexOf(annotation);
if ( i == -1 ) throw new UserException.BadInput("Attempt to modify unknown annotation " + annotation + ": cluster file has " + Utils.join(",", annotationKeys));
tailTypes[i] = t;
logger.info("Updating annotation " + annotation + " to have tail type " + t);
}
public TailType getAnnotationType(int i) {
return tailTypes[i];
}
public void normalizeData() { public void normalizeData() {
boolean foundZeroVarianceAnnotation = false; boolean foundZeroVarianceAnnotation = false;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {

View File

@ -86,6 +86,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
private static final double MAX_SIGMA_TO_BE_IN_GAUSSIAN = 4.0;
public VariantGaussianMixtureModel() { public VariantGaussianMixtureModel() {
dataManager = null; dataManager = null;
maxGaussians = 0; maxGaussians = 0;
@ -449,7 +451,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false ); final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
double z = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
// switch ( dataManager.getAnnotationType(jjj) ) {
// case TWO_TAILED: // z is already fine
// break;
// case SMALL_IS_GOOD:
// if ( z < 0.0 ) {
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
// z = 0.0;
// }
// break;
// case BIG_IS_GOOD:
// if ( z > 0.0 ) {
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
// z = 0.0;
// }
// break;
// default:
// throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
// }
annotations[jjj] = z;
} }
return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster ); return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster );
@ -783,9 +808,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double value = 0.0; double value = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) { for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
final double myMu = mu[kkk][ppp]; final double myMu = mu[kkk][ppp];
final double myAnn = annotations[ppp]; double myAnn = annotations[ppp];
final double mySigma = sigmaVals[ppp][jjj]; final double mySigma = sigmaVals[ppp][jjj];
value += (myAnn - myMu) * mySigma; double z = (myAnn - myMu) * mySigma;
switch ( dataManager.getAnnotationType(jjj) ) {
case TWO_TAILED: break;
case SMALL_IS_GOOD: if( myAnn < myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
case BIG_IS_GOOD: if( myAnn > myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
default: throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
}
value += z;
} }
final double jNorm = annotations[jjj] - mu[kkk][jjj]; final double jNorm = annotations[jjj] - mu[kkk][jjj];
final double prod = value * jNorm; final double prod = value * jNorm;

View File

@ -1,3 +1,4 @@
/* /*
* Copyright (c) 2010 The Broad Institute * Copyright (c) 2010 The Broad Institute
* *
@ -121,6 +122,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false) @Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV; private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
// arguments for handling 1 sided annotations
@Hidden
@Argument(fullName="use_annotation", shortName="an", doc="Allows us to provide information about the sidedness of annotations. Can be either -AN QD [two-sided], -AN QD- [values less than mean are good] and -AN QD+ [values greater than the mean are good]", required=false)
private List<String> USE_ANNOTATIONS = null;
public enum SelectionMetricType { public enum SelectionMetricType {
NOVEL_TITV, NOVEL_TITV,
TRUTH_SENSITIVITY TRUTH_SENSITIVITY
@ -172,6 +178,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" ); throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
} }
// deal with annotations
if ( USE_ANNOTATIONS != null ) {
for ( String annotation : USE_ANNOTATIONS ) {
VariantDataManager.TailType type = VariantDataManager.TailType.TWO_TAILED;
String annotationName = annotation;
if ( annotation.endsWith("-") ) {
type = VariantDataManager.TailType.SMALL_IS_GOOD;
annotationName = annotation.substring(0, annotation.length()-1);
}
else if ( annotation.endsWith("+") ) {
type = VariantDataManager.TailType.BIG_IS_GOOD;
annotationName = annotation.substring(0, annotation.length()-1);
}
theModel.dataManager.setAnnotationType( annotationName, type );
}
}
boolean foundDBSNP = false; boolean foundDBSNP = false;
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) { if( d.getName().startsWith("input") ) {

View File

@ -18,7 +18,7 @@ class recalibrate extends QScript {
@Argument(fullName = "prefix", doc="Prefix argument", required=false) @Argument(fullName = "prefix", doc="Prefix argument", required=false)
var prefix: String = "" var prefix: String = ""
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(4) } trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(3) }
class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) { class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) {
def clusterFile = new File(name + ".clusters") def clusterFile = new File(name + ".clusters")
@ -37,11 +37,20 @@ val TGPWExGdA = new Target("1000G.WEx.GdA", hg19, "b37", new File("/humgen/gsa-s
val targets = List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA) val targets = List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA)
val allTailedAnnotations = List("QD+", "SB-", "HaplotypeScore-", "HRun-")
val someTailedAnnotations = List("QD", "SB-", "HaplotypeScore-", "HRun")
val twoTailedAnnotations = List("QD", "SB", "HaplotypeScore", "HRun")
def script = { def script = {
for (target <- targets) { for (target <- targets) {
add(new GenerateVariantClusters(target) ) add(new GenerateVariantClusters(target) )
add(new VariantRecalibratorTiTv(target) ) add(new VariantRecalibratorTiTv(target, someTailedAnnotations, ".sb.hs.tailed") )
add(new VariantRecalibratorNRS(target) ) add(new VariantRecalibratorNRS(target, someTailedAnnotations, ".sb.hs.tailed") )
add(new VariantRecalibratorTiTv(target, allTailedAnnotations, ".all.tailed") )
add(new VariantRecalibratorNRS(target, allTailedAnnotations, ".all.tailed") )
add(new VariantRecalibratorTiTv(target, twoTailedAnnotations, ".untailed") )
add(new VariantRecalibratorNRS(target, twoTailedAnnotations, ".untailed") )
} }
} }
@ -66,7 +75,7 @@ class GenerateVariantClusters(t: Target) extends org.broadinstitute.sting.queue.
} }
class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS { class VariantRecalibratorBase(t: Target, ans: List[String]) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS {
this.reference_sequence = t.reference this.reference_sequence = t.reference
this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod") this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod")
this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf") this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf")
@ -80,18 +89,20 @@ class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.
this.priorDBSNP = Some(2.0) this.priorDBSNP = Some(2.0)
this.priorHapMap = Some(2.0) this.priorHapMap = Some(2.0)
this.target_titv = t.titvTarget this.target_titv = t.titvTarget
this.use_annotation ++= ans
this.out = new File("/dev/null")
} }
class VariantRecalibratorTiTv(t: Target) extends VariantRecalibratorBase(t) { class VariantRecalibratorTiTv(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t, ans) {
this.tranche ++= List("0.1", "1.0", "10.0", "100.0") this.tranche ++= List("0.1", "1.0", "10.0", "100.0")
this.out = new File(t.name + ".titv.recalibrated.vcf") //this.out = new File(t.name + ".titv.recalibrated.vcf")
this.tranchesFile = new File(t.name + ".titv.tranches") this.tranchesFile = new File(t.name + prefix + ".titv.tranches")
} }
class VariantRecalibratorNRS(t: Target) extends VariantRecalibratorBase(t) { class VariantRecalibratorNRS(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t,ans) {
this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY) this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1") this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1")
this.out = new File(t.name + ".ts.recalibrated.vcf") //this.out = new File(t.name + ".ts.recalibrated.vcf")
this.tranchesFile = new File(t.name + ".ts.tranches") this.tranchesFile = new File(t.name + prefix + ".ts.tranches")
} }
} }