Experimental support for sided annotations. Currently not more/less valuable than two-tailed testing. Future experiments are needed

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4864 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-12-17 15:08:31 +00:00
parent 21dc05138a
commit 32d5397c01
4 changed files with 113 additions and 14 deletions

View File

@ -28,8 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
@ -38,6 +40,11 @@ import java.io.PrintStream;
*/
public class VariantDataManager {
public enum TailType {
TWO_TAILED,
SMALL_IS_GOOD,
BIG_IS_GOOD
}
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
@ -46,6 +53,7 @@ public class VariantDataManager {
public final int numAnnotations;
public final double[] meanVector;
public final double[] varianceVector; // This is really the standard deviation
public final TailType[] tailTypes;
public boolean isNormalized;
public final ExpandingArrayList<String> annotationKeys;
@ -59,6 +67,7 @@ public class VariantDataManager {
numAnnotations = 0;
meanVector = null;
varianceVector = null;
tailTypes = null;
} else {
numAnnotations = _annotationKeys.size();
if( numAnnotations <= 0 ) {
@ -66,7 +75,9 @@ public class VariantDataManager {
}
meanVector = new double[numAnnotations];
varianceVector = new double[numAnnotations];
isNormalized = false;
tailTypes = new TailType[numAnnotations];
Arrays.fill(tailTypes, TailType.TWO_TAILED);
isNormalized = false;
}
annotationKeys = _annotationKeys;
}
@ -79,6 +90,8 @@ public class VariantDataManager {
varianceVector = new double[numAnnotations];
isNormalized = true;
annotationKeys = new ExpandingArrayList<String>();
tailTypes = new TailType[numAnnotations];
Arrays.fill(tailTypes, TailType.TWO_TAILED);
int jjj = 0;
for( final String line : annotationLines ) {
@ -90,6 +103,22 @@ public class VariantDataManager {
}
}
//
// code for dealing with TailTypes
//
public void setAnnotationType(String annotation, TailType t ) {
int i = annotationKeys.indexOf(annotation);
if ( i == -1 ) throw new UserException.BadInput("Attempt to modify unknown annotation " + annotation + ": cluster file has " + Utils.join(",", annotationKeys));
tailTypes[i] = t;
logger.info("Updating annotation " + annotation + " to have tail type " + t);
}
public TailType getAnnotationType(int i) {
return tailTypes[i];
}
public void normalizeData() {
boolean foundZeroVarianceAnnotation = false;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {

View File

@ -86,6 +86,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
private static final double MAX_SIGMA_TO_BE_IN_GAUSSIAN = 4.0;
public VariantGaussianMixtureModel() {
dataManager = null;
maxGaussians = 0;
@ -449,7 +451,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
double z = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
// switch ( dataManager.getAnnotationType(jjj) ) {
// case TWO_TAILED: // z is already fine
// break;
// case SMALL_IS_GOOD:
// if ( z < 0.0 ) {
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
// z = 0.0;
// }
// break;
// case BIG_IS_GOOD:
// if ( z > 0.0 ) {
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
// z = 0.0;
// }
// break;
// default:
// throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
// }
annotations[jjj] = z;
}
return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster );
@ -783,9 +808,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double value = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
final double myMu = mu[kkk][ppp];
final double myAnn = annotations[ppp];
double myAnn = annotations[ppp];
final double mySigma = sigmaVals[ppp][jjj];
value += (myAnn - myMu) * mySigma;
double z = (myAnn - myMu) * mySigma;
switch ( dataManager.getAnnotationType(jjj) ) {
case TWO_TAILED: break;
case SMALL_IS_GOOD: if( myAnn < myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
case BIG_IS_GOOD: if( myAnn > myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
default: throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
}
value += z;
}
final double jNorm = annotations[jjj] - mu[kkk][jjj];
final double prod = value * jNorm;

View File

@ -1,3 +1,4 @@
/*
* Copyright (c) 2010 The Broad Institute
*
@ -121,6 +122,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
// arguments for handling 1 sided annotations
@Hidden
@Argument(fullName="use_annotation", shortName="an", doc="Allows us to provide information about the sidedness of annotations. Can be either -AN QD [two-sided], -AN QD- [values less than mean are good] and -AN QD+ [values greater than the mean are good]", required=false)
private List<String> USE_ANNOTATIONS = null;
public enum SelectionMetricType {
NOVEL_TITV,
TRUTH_SENSITIVITY
@ -172,6 +178,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
// deal with annotations
if ( USE_ANNOTATIONS != null ) {
for ( String annotation : USE_ANNOTATIONS ) {
VariantDataManager.TailType type = VariantDataManager.TailType.TWO_TAILED;
String annotationName = annotation;
if ( annotation.endsWith("-") ) {
type = VariantDataManager.TailType.SMALL_IS_GOOD;
annotationName = annotation.substring(0, annotation.length()-1);
}
else if ( annotation.endsWith("+") ) {
type = VariantDataManager.TailType.BIG_IS_GOOD;
annotationName = annotation.substring(0, annotation.length()-1);
}
theModel.dataManager.setAnnotationType( annotationName, type );
}
}
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {

View File

@ -18,7 +18,7 @@ class recalibrate extends QScript {
@Argument(fullName = "prefix", doc="Prefix argument", required=false)
var prefix: String = ""
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(4) }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(3) }
class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) {
def clusterFile = new File(name + ".clusters")
@ -37,11 +37,20 @@ val TGPWExGdA = new Target("1000G.WEx.GdA", hg19, "b37", new File("/humgen/gsa-s
val targets = List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA)
val allTailedAnnotations = List("QD+", "SB-", "HaplotypeScore-", "HRun-")
val someTailedAnnotations = List("QD", "SB-", "HaplotypeScore-", "HRun")
val twoTailedAnnotations = List("QD", "SB", "HaplotypeScore", "HRun")
def script = {
for (target <- targets) {
add(new GenerateVariantClusters(target) )
add(new VariantRecalibratorTiTv(target) )
add(new VariantRecalibratorNRS(target) )
add(new VariantRecalibratorTiTv(target, someTailedAnnotations, ".sb.hs.tailed") )
add(new VariantRecalibratorNRS(target, someTailedAnnotations, ".sb.hs.tailed") )
add(new VariantRecalibratorTiTv(target, allTailedAnnotations, ".all.tailed") )
add(new VariantRecalibratorNRS(target, allTailedAnnotations, ".all.tailed") )
add(new VariantRecalibratorTiTv(target, twoTailedAnnotations, ".untailed") )
add(new VariantRecalibratorNRS(target, twoTailedAnnotations, ".untailed") )
}
}
@ -66,7 +75,7 @@ class GenerateVariantClusters(t: Target) extends org.broadinstitute.sting.queue.
}
class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS {
class VariantRecalibratorBase(t: Target, ans: List[String]) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS {
this.reference_sequence = t.reference
this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod")
this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf")
@ -80,18 +89,20 @@ class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.
this.priorDBSNP = Some(2.0)
this.priorHapMap = Some(2.0)
this.target_titv = t.titvTarget
this.use_annotation ++= ans
this.out = new File("/dev/null")
}
class VariantRecalibratorTiTv(t: Target) extends VariantRecalibratorBase(t) {
class VariantRecalibratorTiTv(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t, ans) {
this.tranche ++= List("0.1", "1.0", "10.0", "100.0")
this.out = new File(t.name + ".titv.recalibrated.vcf")
this.tranchesFile = new File(t.name + ".titv.tranches")
//this.out = new File(t.name + ".titv.recalibrated.vcf")
this.tranchesFile = new File(t.name + prefix + ".titv.tranches")
}
class VariantRecalibratorNRS(t: Target) extends VariantRecalibratorBase(t) {
class VariantRecalibratorNRS(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t,ans) {
this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1")
this.out = new File(t.name + ".ts.recalibrated.vcf")
this.tranchesFile = new File(t.name + ".ts.tranches")
//this.out = new File(t.name + ".ts.recalibrated.vcf")
this.tranchesFile = new File(t.name + prefix + ".ts.tranches")
}
}