Experimental support for sided annotations. Currently not more/less valuable than two-tailed testing. Future experiments are needed
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4864 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
21dc05138a
commit
32d5397c01
|
|
@ -28,8 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -38,6 +40,11 @@ import java.io.PrintStream;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class VariantDataManager {
|
public class VariantDataManager {
|
||||||
|
public enum TailType {
|
||||||
|
TWO_TAILED,
|
||||||
|
SMALL_IS_GOOD,
|
||||||
|
BIG_IS_GOOD
|
||||||
|
}
|
||||||
|
|
||||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||||
|
|
||||||
|
|
@ -46,6 +53,7 @@ public class VariantDataManager {
|
||||||
public final int numAnnotations;
|
public final int numAnnotations;
|
||||||
public final double[] meanVector;
|
public final double[] meanVector;
|
||||||
public final double[] varianceVector; // This is really the standard deviation
|
public final double[] varianceVector; // This is really the standard deviation
|
||||||
|
public final TailType[] tailTypes;
|
||||||
public boolean isNormalized;
|
public boolean isNormalized;
|
||||||
public final ExpandingArrayList<String> annotationKeys;
|
public final ExpandingArrayList<String> annotationKeys;
|
||||||
|
|
||||||
|
|
@ -59,6 +67,7 @@ public class VariantDataManager {
|
||||||
numAnnotations = 0;
|
numAnnotations = 0;
|
||||||
meanVector = null;
|
meanVector = null;
|
||||||
varianceVector = null;
|
varianceVector = null;
|
||||||
|
tailTypes = null;
|
||||||
} else {
|
} else {
|
||||||
numAnnotations = _annotationKeys.size();
|
numAnnotations = _annotationKeys.size();
|
||||||
if( numAnnotations <= 0 ) {
|
if( numAnnotations <= 0 ) {
|
||||||
|
|
@ -66,6 +75,8 @@ public class VariantDataManager {
|
||||||
}
|
}
|
||||||
meanVector = new double[numAnnotations];
|
meanVector = new double[numAnnotations];
|
||||||
varianceVector = new double[numAnnotations];
|
varianceVector = new double[numAnnotations];
|
||||||
|
tailTypes = new TailType[numAnnotations];
|
||||||
|
Arrays.fill(tailTypes, TailType.TWO_TAILED);
|
||||||
isNormalized = false;
|
isNormalized = false;
|
||||||
}
|
}
|
||||||
annotationKeys = _annotationKeys;
|
annotationKeys = _annotationKeys;
|
||||||
|
|
@ -79,6 +90,8 @@ public class VariantDataManager {
|
||||||
varianceVector = new double[numAnnotations];
|
varianceVector = new double[numAnnotations];
|
||||||
isNormalized = true;
|
isNormalized = true;
|
||||||
annotationKeys = new ExpandingArrayList<String>();
|
annotationKeys = new ExpandingArrayList<String>();
|
||||||
|
tailTypes = new TailType[numAnnotations];
|
||||||
|
Arrays.fill(tailTypes, TailType.TWO_TAILED);
|
||||||
|
|
||||||
int jjj = 0;
|
int jjj = 0;
|
||||||
for( final String line : annotationLines ) {
|
for( final String line : annotationLines ) {
|
||||||
|
|
@ -90,6 +103,22 @@ public class VariantDataManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// code for dealing with TailTypes
|
||||||
|
//
|
||||||
|
|
||||||
|
public void setAnnotationType(String annotation, TailType t ) {
|
||||||
|
int i = annotationKeys.indexOf(annotation);
|
||||||
|
if ( i == -1 ) throw new UserException.BadInput("Attempt to modify unknown annotation " + annotation + ": cluster file has " + Utils.join(",", annotationKeys));
|
||||||
|
tailTypes[i] = t;
|
||||||
|
logger.info("Updating annotation " + annotation + " to have tail type " + t);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TailType getAnnotationType(int i) {
|
||||||
|
return tailTypes[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void normalizeData() {
|
public void normalizeData() {
|
||||||
boolean foundZeroVarianceAnnotation = false;
|
boolean foundZeroVarianceAnnotation = false;
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
|
|
|
||||||
|
|
@ -86,6 +86,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||||
|
|
||||||
|
private static final double MAX_SIGMA_TO_BE_IN_GAUSSIAN = 4.0;
|
||||||
|
|
||||||
public VariantGaussianMixtureModel() {
|
public VariantGaussianMixtureModel() {
|
||||||
dataManager = null;
|
dataManager = null;
|
||||||
maxGaussians = 0;
|
maxGaussians = 0;
|
||||||
|
|
@ -449,7 +451,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
|
|
||||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||||
final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false );
|
final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false );
|
||||||
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
|
||||||
|
double z = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
||||||
|
// switch ( dataManager.getAnnotationType(jjj) ) {
|
||||||
|
// case TWO_TAILED: // z is already fine
|
||||||
|
// break;
|
||||||
|
// case SMALL_IS_GOOD:
|
||||||
|
// if ( z < 0.0 ) {
|
||||||
|
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
|
||||||
|
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
|
||||||
|
// z = 0.0;
|
||||||
|
// }
|
||||||
|
// break;
|
||||||
|
// case BIG_IS_GOOD:
|
||||||
|
// if ( z > 0.0 ) {
|
||||||
|
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
|
||||||
|
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
|
||||||
|
// z = 0.0;
|
||||||
|
// }
|
||||||
|
// break;
|
||||||
|
// default:
|
||||||
|
// throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
|
||||||
|
// }
|
||||||
|
|
||||||
|
annotations[jjj] = z;
|
||||||
}
|
}
|
||||||
|
|
||||||
return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster );
|
return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster );
|
||||||
|
|
@ -783,9 +808,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
double value = 0.0;
|
double value = 0.0;
|
||||||
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||||
final double myMu = mu[kkk][ppp];
|
final double myMu = mu[kkk][ppp];
|
||||||
final double myAnn = annotations[ppp];
|
double myAnn = annotations[ppp];
|
||||||
final double mySigma = sigmaVals[ppp][jjj];
|
final double mySigma = sigmaVals[ppp][jjj];
|
||||||
value += (myAnn - myMu) * mySigma;
|
double z = (myAnn - myMu) * mySigma;
|
||||||
|
|
||||||
|
switch ( dataManager.getAnnotationType(jjj) ) {
|
||||||
|
case TWO_TAILED: break;
|
||||||
|
case SMALL_IS_GOOD: if( myAnn < myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
|
||||||
|
case BIG_IS_GOOD: if( myAnn > myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
|
||||||
|
default: throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
|
||||||
|
}
|
||||||
|
|
||||||
|
value += z;
|
||||||
}
|
}
|
||||||
final double jNorm = annotations[jjj] - mu[kkk][jjj];
|
final double jNorm = annotations[jjj] - mu[kkk][jjj];
|
||||||
final double prod = value * jNorm;
|
final double prod = value * jNorm;
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
*
|
*
|
||||||
|
|
@ -121,6 +122,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
|
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
|
||||||
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
|
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
|
||||||
|
|
||||||
|
// arguments for handling 1 sided annotations
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="use_annotation", shortName="an", doc="Allows us to provide information about the sidedness of annotations. Can be either -AN QD [two-sided], -AN QD- [values less than mean are good] and -AN QD+ [values greater than the mean are good]", required=false)
|
||||||
|
private List<String> USE_ANNOTATIONS = null;
|
||||||
|
|
||||||
public enum SelectionMetricType {
|
public enum SelectionMetricType {
|
||||||
NOVEL_TITV,
|
NOVEL_TITV,
|
||||||
TRUTH_SENSITIVITY
|
TRUTH_SENSITIVITY
|
||||||
|
|
@ -172,6 +178,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// deal with annotations
|
||||||
|
if ( USE_ANNOTATIONS != null ) {
|
||||||
|
for ( String annotation : USE_ANNOTATIONS ) {
|
||||||
|
VariantDataManager.TailType type = VariantDataManager.TailType.TWO_TAILED;
|
||||||
|
String annotationName = annotation;
|
||||||
|
if ( annotation.endsWith("-") ) {
|
||||||
|
type = VariantDataManager.TailType.SMALL_IS_GOOD;
|
||||||
|
annotationName = annotation.substring(0, annotation.length()-1);
|
||||||
|
}
|
||||||
|
else if ( annotation.endsWith("+") ) {
|
||||||
|
type = VariantDataManager.TailType.BIG_IS_GOOD;
|
||||||
|
annotationName = annotation.substring(0, annotation.length()-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
theModel.dataManager.setAnnotationType( annotationName, type );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
boolean foundDBSNP = false;
|
boolean foundDBSNP = false;
|
||||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||||
if( d.getName().startsWith("input") ) {
|
if( d.getName().startsWith("input") ) {
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ class recalibrate extends QScript {
|
||||||
@Argument(fullName = "prefix", doc="Prefix argument", required=false)
|
@Argument(fullName = "prefix", doc="Prefix argument", required=false)
|
||||||
var prefix: String = ""
|
var prefix: String = ""
|
||||||
|
|
||||||
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(4) }
|
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(3) }
|
||||||
|
|
||||||
class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) {
|
class Target(val name: String, val reference: File, val rodName: String, val VCF: File, val intervals: Option[String], val titvTarget: Double) {
|
||||||
def clusterFile = new File(name + ".clusters")
|
def clusterFile = new File(name + ".clusters")
|
||||||
|
|
@ -37,11 +37,20 @@ val TGPWExGdA = new Target("1000G.WEx.GdA", hg19, "b37", new File("/humgen/gsa-s
|
||||||
|
|
||||||
val targets = List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA)
|
val targets = List(HiSeq, WEx, LowPassN60, LowPassAugust, TGPWExFH, TGPWExGdA)
|
||||||
|
|
||||||
|
val allTailedAnnotations = List("QD+", "SB-", "HaplotypeScore-", "HRun-")
|
||||||
|
val someTailedAnnotations = List("QD", "SB-", "HaplotypeScore-", "HRun")
|
||||||
|
val twoTailedAnnotations = List("QD", "SB", "HaplotypeScore", "HRun")
|
||||||
|
|
||||||
def script = {
|
def script = {
|
||||||
for (target <- targets) {
|
for (target <- targets) {
|
||||||
add(new GenerateVariantClusters(target) )
|
add(new GenerateVariantClusters(target) )
|
||||||
add(new VariantRecalibratorTiTv(target) )
|
add(new VariantRecalibratorTiTv(target, someTailedAnnotations, ".sb.hs.tailed") )
|
||||||
add(new VariantRecalibratorNRS(target) )
|
add(new VariantRecalibratorNRS(target, someTailedAnnotations, ".sb.hs.tailed") )
|
||||||
|
add(new VariantRecalibratorTiTv(target, allTailedAnnotations, ".all.tailed") )
|
||||||
|
add(new VariantRecalibratorNRS(target, allTailedAnnotations, ".all.tailed") )
|
||||||
|
add(new VariantRecalibratorTiTv(target, twoTailedAnnotations, ".untailed") )
|
||||||
|
add(new VariantRecalibratorNRS(target, twoTailedAnnotations, ".untailed") )
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -66,7 +75,7 @@ class GenerateVariantClusters(t: Target) extends org.broadinstitute.sting.queue.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS {
|
class VariantRecalibratorBase(t: Target, ans: List[String]) extends org.broadinstitute.sting.queue.extensions.gatk.VariantRecalibrator with UNIVERSAL_GATK_ARGS {
|
||||||
this.reference_sequence = t.reference
|
this.reference_sequence = t.reference
|
||||||
this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod")
|
this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_" + t.rodName + ".rod")
|
||||||
this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf")
|
this.rodBind :+= RodBind("hapmap", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr." + t.rodName + "_fwd.vcf")
|
||||||
|
|
@ -80,18 +89,20 @@ class VariantRecalibratorBase(t: Target) extends org.broadinstitute.sting.queue.
|
||||||
this.priorDBSNP = Some(2.0)
|
this.priorDBSNP = Some(2.0)
|
||||||
this.priorHapMap = Some(2.0)
|
this.priorHapMap = Some(2.0)
|
||||||
this.target_titv = t.titvTarget
|
this.target_titv = t.titvTarget
|
||||||
|
this.use_annotation ++= ans
|
||||||
|
this.out = new File("/dev/null")
|
||||||
}
|
}
|
||||||
|
|
||||||
class VariantRecalibratorTiTv(t: Target) extends VariantRecalibratorBase(t) {
|
class VariantRecalibratorTiTv(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t, ans) {
|
||||||
this.tranche ++= List("0.1", "1.0", "10.0", "100.0")
|
this.tranche ++= List("0.1", "1.0", "10.0", "100.0")
|
||||||
this.out = new File(t.name + ".titv.recalibrated.vcf")
|
//this.out = new File(t.name + ".titv.recalibrated.vcf")
|
||||||
this.tranchesFile = new File(t.name + ".titv.tranches")
|
this.tranchesFile = new File(t.name + prefix + ".titv.tranches")
|
||||||
}
|
}
|
||||||
|
|
||||||
class VariantRecalibratorNRS(t: Target) extends VariantRecalibratorBase(t) {
|
class VariantRecalibratorNRS(t: Target, ans: List[String], prefix: String) extends VariantRecalibratorBase(t,ans) {
|
||||||
this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
|
this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
|
||||||
this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1")
|
this.tranche ++= List("50", "25", "10", "5", "2", "1", "0.5", "0.1")
|
||||||
this.out = new File(t.name + ".ts.recalibrated.vcf")
|
//this.out = new File(t.name + ".ts.recalibrated.vcf")
|
||||||
this.tranchesFile = new File(t.name + ".ts.tranches")
|
this.tranchesFile = new File(t.name + prefix + ".ts.tranches")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue