Variant Recalibrator now makes use of a prior on known/novel status as well as on allele frequency spectrum. The VariantOptimizer walker now clusters with all variants but gives more weight to knowns / hapmap / 1KG / MQ1 sites. The weights are all optional command line arguments. We no longer assign default values to annotations that are malformed. The walkers will crash with exception so as to not cover up potential issues. We only produce titv-less clusters now, and so the titv argument in VO was removed and the WithoutTiTv string that gets added to the cluster file is removed. The wiki is updated to show new example commands.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3433 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ae6c014884
commit
bf530d23de
|
|
@ -71,7 +71,6 @@ public class DbSNPHelper {
|
||||||
return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del");
|
return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static boolean isHapmap(DbSNPFeature feature) {
|
public static boolean isHapmap(DbSNPFeature feature) {
|
||||||
return feature.getValidationStatus().contains("by-hapmap");
|
return feature.getValidationStatus().contains("by-hapmap");
|
||||||
}
|
}
|
||||||
|
|
@ -80,6 +79,14 @@ public class DbSNPHelper {
|
||||||
return feature.getValidationStatus().contains("by-2hit-2allele");
|
return feature.getValidationStatus().contains("by-2hit-2allele");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean is1000genomes(DbSNPFeature feature) {
|
||||||
|
return feature.getValidationStatus().contains("by-1000genomes");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isMQ1(DbSNPFeature feature) {
|
||||||
|
return feature.getWeight() == 1;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* gets the alternate alleles. This method should return all the alleles present at the location,
|
* gets the alternate alleles. This method should return all the alleles present at the location,
|
||||||
* NOT including the reference base. This is returned as a string list with no guarantee ordering
|
* NOT including the reference base. This is returned as a string list with no guarantee ordering
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
|
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
|
@ -60,7 +61,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
|
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
|
||||||
private double TARGET_TITV = 2.1;
|
private double TARGET_TITV = 2.1;
|
||||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
|
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
|
||||||
private double BACKOFF_FACTOR = 1.0;
|
private double BACKOFF_FACTOR = 1.0;
|
||||||
|
|
@ -70,8 +71,12 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||||
private String[] IGNORE_INPUT_FILTERS = null;
|
private String[] IGNORE_INPUT_FILTERS = null;
|
||||||
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true. Setting to 0.0 means unused.", required=false)
|
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
|
||||||
private double KNOWN_VAR_QUAL_PRIOR = 0.0;
|
private int KNOWN_QUAL_PRIOR = 9;
|
||||||
|
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
|
||||||
|
private int NOVEL_QUAL_PRIOR = 2;
|
||||||
|
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
|
||||||
|
private double QUALITY_SCALE_FACTOR = 50.0;
|
||||||
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||||
private String OUTPUT_PREFIX = "optimizer";
|
private String OUTPUT_PREFIX = "optimizer";
|
||||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||||
|
|
@ -90,8 +95,6 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
private VCFWriter vcfWriter;
|
private VCFWriter vcfWriter;
|
||||||
private Set<String> ignoreInputFilterSet = null;
|
private Set<String> ignoreInputFilterSet = null;
|
||||||
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
|
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
|
||||||
private boolean usingDBSNP = false;
|
|
||||||
|
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
@ -130,9 +133,9 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
|
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
|
||||||
final TreeSet<String> samples = new TreeSet<String>();
|
final TreeSet<String> samples = new TreeSet<String>();
|
||||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||||
for ( final ReferenceOrderedDataSource source : dataSources ) {
|
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||||
final RMDTrack rod = source.getReferenceOrderedData();
|
final RMDTrack rod = source.getReferenceOrderedData();
|
||||||
if ( rod.getType().equals(VCFCodec.class) ) {
|
if( rod.getType().equals(VCFCodec.class) ) {
|
||||||
final VCFReader reader = new VCFReader(rod.getFile());
|
final VCFReader reader = new VCFReader(rod.getFile());
|
||||||
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||||
samples.addAll(vcfSamples);
|
samples.addAll(vcfSamples);
|
||||||
|
|
@ -142,12 +145,17 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||||
vcfWriter.writeHeader(vcfHeader);
|
vcfWriter.writeHeader(vcfHeader);
|
||||||
|
|
||||||
|
boolean foundDBSNP = false;
|
||||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||||
final RMDTrack rod = source.getReferenceOrderedData();
|
final RMDTrack rod = source.getReferenceOrderedData();
|
||||||
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
if( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||||
usingDBSNP = true;
|
foundDBSNP = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(!foundDBSNP) {
|
||||||
|
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -169,27 +177,17 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||||
final VariantDatum variantDatum = new VariantDatum();
|
final VariantDatum variantDatum = new VariantDatum();
|
||||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||||
boolean isKnown = !vc.getAttribute("ID").equals(".");
|
|
||||||
if(usingDBSNP) {
|
|
||||||
isKnown = false;
|
|
||||||
for( VariantContext dbsnpVC : tracker.getVariantContexts(ref, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
|
|
||||||
if(dbsnpVC != null && dbsnpVC.isSNP()) {
|
|
||||||
isKnown=true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
variantDatum.isKnown = isKnown;
|
|
||||||
|
|
||||||
final double pTrue = theModel.evaluateVariant( vc );
|
|
||||||
double recalQual = 400.0 * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) );
|
|
||||||
|
|
||||||
if( variantDatum.isKnown && KNOWN_VAR_QUAL_PRIOR > 0.1 ) { // only use the known prior if the value is specified (meaning not equal to zero)
|
|
||||||
variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR;
|
|
||||||
} else {
|
|
||||||
variantDatum.qual = recalQual;
|
|
||||||
}
|
|
||||||
mapList.add( variantDatum );
|
|
||||||
|
|
||||||
|
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||||
|
variantDatum.isKnown = dbsnp != null;
|
||||||
|
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
|
||||||
|
|
||||||
|
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
|
||||||
|
final double knownPrior = ( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
|
||||||
|
final double pTrue = theModel.evaluateVariant( vc ) * acPrior * knownPrior;
|
||||||
|
|
||||||
|
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
|
||||||
|
mapList.add( variantDatum );
|
||||||
|
|
||||||
vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() );
|
vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() );
|
||||||
vcf.setQual( variantDatum.qual );
|
vcf.setQual( variantDatum.qual );
|
||||||
|
|
|
||||||
|
|
@ -36,4 +36,6 @@ public class VariantDatum {
|
||||||
public boolean isTransition;
|
public boolean isTransition;
|
||||||
public boolean isKnown;
|
public boolean isKnown;
|
||||||
public double qual;
|
public double qual;
|
||||||
|
public double weight;
|
||||||
|
public int alleleCount;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,22 +58,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
private final double MIN_SIGMA = 1E-5;
|
private final double MIN_SIGMA = 1E-5;
|
||||||
private final double MIN_DETERMINANT = 1E-5;
|
private final double MIN_DETERMINANT = 1E-5;
|
||||||
|
|
||||||
private final double[][] mu; // The means for the clusters
|
private final double[][] mu; // The means for each cluster
|
||||||
private final Matrix[] sigma; // The variances for the clusters, sigma is really sigma^2
|
private final Matrix[] sigma; // The covariance matrix for each cluster
|
||||||
private final Matrix[] sigmaInverse;
|
private final Matrix[] sigmaInverse;
|
||||||
private final double[] pCluster;
|
private final double[] pCluster;
|
||||||
private final double[] determinant;
|
private final double[] determinant;
|
||||||
private final double[] clusterTITV;
|
private final double[] alleleCountFactorArray;
|
||||||
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
|
||||||
private final int minVarInCluster;
|
private final int minVarInCluster;
|
||||||
public final boolean isUsingTiTvModel;
|
|
||||||
|
|
||||||
private static final double INFINITE_ANNOTATION_VALUE = 6000.0;
|
|
||||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||||
|
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
|
||||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||||
|
|
||||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
|
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final int maxAC ) {
|
||||||
super( _targetTITV );
|
|
||||||
dataManager = _dataManager;
|
dataManager = _dataManager;
|
||||||
numGaussians = _numGaussians;
|
numGaussians = _numGaussians;
|
||||||
numIterations = _numIterations;
|
numIterations = _numIterations;
|
||||||
|
|
@ -82,22 +79,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
sigma = new Matrix[numGaussians];
|
sigma = new Matrix[numGaussians];
|
||||||
determinant = new double[numGaussians];
|
determinant = new double[numGaussians];
|
||||||
pCluster = new double[numGaussians];
|
pCluster = new double[numGaussians];
|
||||||
clusterTITV = new double[numGaussians];
|
alleleCountFactorArray = new double[maxAC + 1];
|
||||||
clusterTruePositiveRate = new double[numGaussians];
|
|
||||||
minVarInCluster = _minVarInCluster;
|
minVarInCluster = _minVarInCluster;
|
||||||
sigmaInverse = null;
|
sigmaInverse = null; // This field isn't used during VariantOptimizer pass
|
||||||
isUsingTiTvModel = false; // this field isn't used during VariantOptimizerWalker
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
|
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
|
||||||
super( _targetTITV );
|
super( _targetTITV );
|
||||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||||
|
final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
|
||||||
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for ( String line : new XReadLines(new File( clusterFileName )) ) {
|
for ( final String line : new XReadLines(new File( clusterFileName )) ) {
|
||||||
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||||
annotationLines.add(line);
|
annotationLines.add(line);
|
||||||
|
} else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
|
||||||
|
alleleCountLines.add(line);
|
||||||
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||||
clusterLines.add(line);
|
clusterLines.add(line);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -111,7 +109,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
dataManager = new VariantDataManager( annotationLines );
|
dataManager = new VariantDataManager( annotationLines );
|
||||||
// Several of the clustering parameters aren't used the second time around in ApplyVariantClusters
|
// Several of the clustering parameters aren't used the second time around in ApplyVariantClusters
|
||||||
numIterations = 0;
|
numIterations = 0;
|
||||||
clusterTITV = null;
|
|
||||||
minVarInCluster = 0;
|
minVarInCluster = 0;
|
||||||
|
|
||||||
// BUGBUG: move this parsing out of the constructor
|
// BUGBUG: move this parsing out of the constructor
|
||||||
|
|
@ -122,19 +119,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
sigmaInverse = new Matrix[numGaussians];
|
sigmaInverse = new Matrix[numGaussians];
|
||||||
pCluster = new double[numGaussians];
|
pCluster = new double[numGaussians];
|
||||||
determinant = new double[numGaussians];
|
determinant = new double[numGaussians];
|
||||||
clusterTruePositiveRate = new double[numGaussians];
|
|
||||||
boolean _isUsingTiTvModel = false;
|
alleleCountFactorArray = new double[alleleCountLines.size() + 1];
|
||||||
|
for( final String line : alleleCountLines ) {
|
||||||
|
final String[] vals = line.split(",");
|
||||||
|
alleleCountFactorArray[Integer.parseInt(vals[1])] = Double.parseDouble(vals[2]);
|
||||||
|
}
|
||||||
|
|
||||||
int kkk = 0;
|
int kkk = 0;
|
||||||
for( String line : clusterLines ) {
|
for( final String line : clusterLines ) {
|
||||||
final String[] vals = line.split(",");
|
final String[] vals = line.split(",");
|
||||||
pCluster[kkk] = Double.parseDouble(vals[1]);
|
pCluster[kkk] = Double.parseDouble(vals[1]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
|
||||||
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[3]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
|
|
||||||
if( clusterTruePositiveRate[kkk] != 1.0 ) { _isUsingTiTvModel = true; }
|
|
||||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||||
mu[kkk][jjj] = Double.parseDouble(vals[4+jjj]);
|
mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]);
|
||||||
for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
|
for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
|
||||||
sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[4+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor; // BUGBUG: *3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points
|
sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[2+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -143,93 +142,48 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
determinant[kkk] = sigma[kkk].det();
|
determinant[kkk] = sigma[kkk].det();
|
||||||
kkk++;
|
kkk++;
|
||||||
}
|
}
|
||||||
isUsingTiTvModel = _isUsingTiTvModel;
|
|
||||||
|
|
||||||
logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
|
logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void run( final String clusterFileName ) {
|
public final void run( final String clusterFileName ) {
|
||||||
|
|
||||||
final int MAX_KNOWN_VARS = 5000000; // BUGBUG: make this a command line argument
|
// Initialize the Allele Count prior
|
||||||
final int MAX_NOVEL_VARS = 5000000; // BUGBUG: make this a command line argument
|
generateAlleleCountPrior();
|
||||||
final double knownNovelMixture = 1.5; // BUGBUG: make this a command line argument
|
|
||||||
|
|
||||||
// Create the subset of the data to cluster with
|
// Simply cluster with all the variants. The knowns have been given more weight than the novels
|
||||||
int numNovel = 0;
|
logger.info("Clustering with " + dataManager.data.length + " variants.");
|
||||||
int numKnown = 0;
|
createClusters( dataManager.data, 0, numGaussians, clusterFileName, false );
|
||||||
|
}
|
||||||
|
|
||||||
|
private void generateAlleleCountPrior() {
|
||||||
|
|
||||||
|
final double[] acExpectation = new double[alleleCountFactorArray.length];
|
||||||
|
final double[] acActual = new double[alleleCountFactorArray.length];
|
||||||
|
final int[] alleleCount = new int[alleleCountFactorArray.length];
|
||||||
|
|
||||||
|
double sumExpectation = 0.0;
|
||||||
|
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||||
|
acExpectation[iii] = 1.0 / ((double) iii);
|
||||||
|
sumExpectation += acExpectation[iii];
|
||||||
|
}
|
||||||
|
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||||
|
acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
|
||||||
|
alleleCount[iii] = 0;
|
||||||
|
}
|
||||||
for( final VariantDatum datum : dataManager.data ) {
|
for( final VariantDatum datum : dataManager.data ) {
|
||||||
if( datum.isKnown ) {
|
alleleCount[datum.alleleCount]++;
|
||||||
numKnown++;
|
|
||||||
} else {
|
|
||||||
numNovel++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||||
final int numNovelCluster = Math.min( numNovel, MAX_NOVEL_VARS );
|
acActual[iii] = ((double)alleleCount[iii]) / ((double)dataManager.data.length); // Turn acActual into a probability distribution
|
||||||
final int numKnownCluster = Math.min( numKnown, MAX_KNOWN_VARS );
|
|
||||||
final int numKnownTogether = Math.min( numKnownCluster, (int) Math.floor(knownNovelMixture * numNovelCluster) );
|
|
||||||
|
|
||||||
final VariantDatum[] dataTogether = new VariantDatum[numNovelCluster + numKnownTogether];
|
|
||||||
final VariantDatum[] dataKnown = new VariantDatum[numKnownCluster];
|
|
||||||
|
|
||||||
// Create the dataTogether array, which is all the novels and 1.5x as many knowns, downsampled if there are too many
|
|
||||||
int iii = 0;
|
|
||||||
if( numNovelCluster == numNovel ) {
|
|
||||||
for( final VariantDatum datum : dataManager.data ) {
|
|
||||||
if( !datum.isKnown ) {
|
|
||||||
dataTogether[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
logger.info("Capped at " + MAX_NOVEL_VARS + " novel variants.");
|
|
||||||
while( iii < numNovelCluster ) {
|
|
||||||
final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)];
|
|
||||||
if( !datum.isKnown ) {
|
|
||||||
dataTogether[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if( numKnownTogether == numKnown ) {
|
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||||
for( final VariantDatum datum : dataManager.data ) {
|
alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
|
||||||
if( datum.isKnown ) {
|
|
||||||
dataTogether[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
while( iii < numNovelCluster + numKnownTogether ) {
|
|
||||||
final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)];
|
|
||||||
if( datum.isKnown ) {
|
|
||||||
dataTogether[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Create the dataKnown array, which is simply all the known vars or downsampled if there are too many
|
public final double getAlleleCountPrior( final int alleleCount ) {
|
||||||
iii = 0;
|
return alleleCountFactorArray[alleleCount];
|
||||||
if( numKnownCluster == numKnown ) {
|
|
||||||
for( final VariantDatum datum : dataManager.data ) {
|
|
||||||
if( datum.isKnown ) {
|
|
||||||
dataKnown[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
logger.info("Capped at " + MAX_KNOWN_VARS + " known variants.");
|
|
||||||
while( iii < numKnownCluster ) {
|
|
||||||
final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)];
|
|
||||||
if( datum.isKnown ) {
|
|
||||||
dataKnown[iii++] = datum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final boolean useTITV = true;
|
|
||||||
logger.info("First, cluster with novels and knowns together to use ti/tv based models:");
|
|
||||||
logger.info("Clustering with " + numNovelCluster + " novel variants and " + numKnownTogether + " known variants.");
|
|
||||||
createClusters( dataTogether, 0, numGaussians, clusterFileName, useTITV );
|
|
||||||
|
|
||||||
logger.info("Finally, cluster with only knowns to use ti/tv-less models:");
|
|
||||||
logger.info("Clustering with " + numKnownCluster + " known variants.");
|
|
||||||
createClusters( dataKnown, 0, numGaussians, clusterFileName, !useTITV );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
|
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
|
||||||
|
|
@ -277,108 +231,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );
|
maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Estimate each cluster's p(true) and output cluster parameters
|
// Output cluster parameters at each iteration
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
outputGaussians( data, pVarInCluster, ttt+1, startCluster, stopCluster, clusterFileName, useTITV );
|
printClusterParameters( clusterFileName + "." + (ttt+1) );
|
||||||
|
|
||||||
logger.info("Finished iteration " + (ttt+1) );
|
logger.info("Finished iteration " + (ttt+1) );
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
private void outputGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int iterationNumber,
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
|
// Output the final cluster parameters
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
if( !useTITV ) {
|
printClusterParameters( clusterFileName );
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
|
||||||
clusterTITV[kkk] = 0.0;
|
|
||||||
clusterTruePositiveRate[kkk] = 1.0;
|
|
||||||
}
|
|
||||||
printClusterParameters( clusterFileName + ".WithoutTiTv." + iterationNumber );
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int numVariants = data.length;
|
|
||||||
|
|
||||||
final double[] probTi = new double[numGaussians];
|
|
||||||
final double[] probTv = new double[numGaussians];
|
|
||||||
final double[] probKnown = new double[numGaussians];
|
|
||||||
final double[] probNovel = new double[numGaussians];
|
|
||||||
final double[] probKnownTi = new double[numGaussians];
|
|
||||||
final double[] probKnownTv = new double[numGaussians];
|
|
||||||
final double[] probNovelTi = new double[numGaussians];
|
|
||||||
final double[] probNovelTv = new double[numGaussians];
|
|
||||||
|
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
|
||||||
probTi[kkk] = 0.0;
|
|
||||||
probTv[kkk] = 0.0;
|
|
||||||
probKnown[kkk] = 0.0;
|
|
||||||
probNovel[kkk] = 0.0;
|
|
||||||
probKnownTi[kkk] = 0.0;
|
|
||||||
probKnownTv[kkk] = 0.0;
|
|
||||||
probNovelTi[kkk] = 0.0;
|
|
||||||
probNovelTv[kkk] = 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use the cluster's probabilistic Ti/Tv ratio as the indication of the cluster's true positive rate
|
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
|
||||||
final boolean isTransition = data[iii].isTransition;
|
|
||||||
final boolean isKnown = data[iii].isKnown;
|
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
|
||||||
final double prob = pVarInCluster[kkk][iii];
|
|
||||||
if( isKnown ) { // known
|
|
||||||
probKnown[kkk] += prob;
|
|
||||||
if( isTransition ) { // transition
|
|
||||||
probKnownTi[kkk] += prob;
|
|
||||||
probTi[kkk] += prob;
|
|
||||||
} else { // transversion
|
|
||||||
probKnownTv[kkk] += prob;
|
|
||||||
probTv[kkk] += prob;
|
|
||||||
}
|
|
||||||
} else { //novel
|
|
||||||
probNovel[kkk] += prob;
|
|
||||||
if( isTransition ) { // transition
|
|
||||||
probNovelTi[kkk] += prob;
|
|
||||||
probTi[kkk] += prob;
|
|
||||||
} else { // transversion
|
|
||||||
probNovelTv[kkk] += prob;
|
|
||||||
probTv[kkk] += prob;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for( int ttt = 0; ttt < 3; ttt++ ) {
|
|
||||||
double knownAlphaFactor = 0.0;
|
|
||||||
if( ttt == 0 ) {
|
|
||||||
knownAlphaFactor = 0.0;
|
|
||||||
} else if( ttt == 1 ) {
|
|
||||||
knownAlphaFactor = 1.0;
|
|
||||||
} else if( ttt == 2 ) {
|
|
||||||
knownAlphaFactor = 0.5;
|
|
||||||
}
|
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
|
||||||
clusterTITV[kkk] = probTi[kkk] / probTv[kkk];
|
|
||||||
if( probKnown[kkk] > 500.0 && probNovel[kkk] > 500.0 ) {
|
|
||||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk], clusterTITV[kkk], knownAlphaFactor );
|
|
||||||
} else {
|
|
||||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( ttt == 0 ) {
|
|
||||||
printClusterParameters( clusterFileName + ".TargetTiTv." + iterationNumber );
|
|
||||||
} else if( ttt == 1 ) {
|
|
||||||
printClusterParameters( clusterFileName + ".KnownTiTv." + iterationNumber );
|
|
||||||
} else if( ttt == 2 ) {
|
|
||||||
printClusterParameters( clusterFileName + ".BlendedTiTv." + iterationNumber );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printClusterParameters( final String clusterFileName ) {
|
private void printClusterParameters( final String clusterFileName ) {
|
||||||
try {
|
try {
|
||||||
final PrintStream outputFile = new PrintStream( clusterFileName );
|
final PrintStream outputFile = new PrintStream( clusterFileName );
|
||||||
dataManager.printClusterFileHeader( outputFile );
|
dataManager.printClusterFileHeader( outputFile );
|
||||||
|
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||||
|
outputFile.print("@!ALLELECOUNT,");
|
||||||
|
outputFile.println(iii + "," + alleleCountFactorArray[iii]);
|
||||||
|
}
|
||||||
|
|
||||||
final int numAnnotations = mu[0].length;
|
final int numAnnotations = mu[0].length;
|
||||||
final int numVariants = dataManager.numVariants;
|
final int numVariants = dataManager.numVariants;
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
|
|
@ -386,8 +260,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
final double sigmaVals[][] = sigma[kkk].getArray();
|
final double sigmaVals[][] = sigma[kkk].getArray();
|
||||||
outputFile.print("@!CLUSTER,");
|
outputFile.print("@!CLUSTER,");
|
||||||
outputFile.print(pCluster[kkk] + ",");
|
outputFile.print(pCluster[kkk] + ",");
|
||||||
outputFile.print(clusterTITV[kkk] + ",");
|
|
||||||
outputFile.print(clusterTruePositiveRate[kkk] + ",");
|
|
||||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
outputFile.print(mu[kkk][jjj] + ",");
|
outputFile.print(mu[kkk][jjj] + ",");
|
||||||
}
|
}
|
||||||
|
|
@ -406,19 +278,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
|
|
||||||
public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) {
|
public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) {
|
||||||
double value = 0.0;
|
double value;
|
||||||
if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
|
//if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
|
||||||
value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
|
// value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
|
||||||
} else if( annotationKey.equals("QUAL") ) {
|
//}
|
||||||
|
if( annotationKey.equals("QUAL") ) {
|
||||||
value = vc.getPhredScaledQual();
|
value = vc.getPhredScaledQual();
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey, "0.0" ) );
|
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||||
if( Double.isInfinite(value) ) {
|
|
||||||
value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
|
|
||||||
}
|
|
||||||
} catch( NumberFormatException e ) {
|
} catch( NumberFormatException e ) {
|
||||||
// do nothing, default value is 0.0
|
throw new StingException("No double value detected for annotation = " + annotationKey +
|
||||||
|
" in variant at " + vc.getLocation() + ", reported annotation value = " + vc.getAttribute( annotationKey ) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return value;
|
return value;
|
||||||
|
|
@ -437,7 +308,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
|
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
|
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
@ -496,7 +367,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
|
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
|
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
|
||||||
|
|
||||||
final int numVariants = data.length;
|
final int numVariants = data.length;
|
||||||
|
|
@ -669,8 +539,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
|
throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
|
if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem
|
||||||
pVarInCluster[kkk][iii] = MIN_PROB;// + MIN_PROB * rand.nextDouble();
|
pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble();
|
||||||
}
|
}
|
||||||
|
|
||||||
sumProb += pVarInCluster[kkk][iii];
|
sumProb += pVarInCluster[kkk][iii];
|
||||||
|
|
@ -678,11 +548,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
|
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
||||||
pVarInCluster[kkk][iii] /= sumProb;
|
pVarInCluster[kkk][iii] /= sumProb;
|
||||||
|
pVarInCluster[kkk][iii] *= data[iii].weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Explained likelihood = " + String.format("%.5f",likelihood / ((double) data.length)));
|
logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -825,7 +695,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
// Replace extremely small clusters with another random draw from the dataset
|
// Replace extremely small clusters with another random draw from the dataset
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
||||||
if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
|
if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
|
||||||
|
|
@ -855,5 +725,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
||||||
pCluster[kkk] /= sumPK;
|
pCluster[kkk] /= sumPK;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -40,6 +40,10 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
||||||
|
|
||||||
protected final double targetTITV;
|
protected final double targetTITV;
|
||||||
|
|
||||||
|
public VariantOptimizationModel() {
|
||||||
|
targetTITV = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
public VariantOptimizationModel( final double _targetTITV ) {
|
public VariantOptimizationModel( final double _targetTITV ) {
|
||||||
targetTITV = _targetTITV;
|
targetTITV = _targetTITV;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
|
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
|
@ -55,8 +56,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
|
|
||||||
private double TARGET_TITV = 2.1;
|
|
||||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||||
|
|
@ -65,17 +64,24 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
private String[] USE_ANNOTATIONS = null;
|
private String[] USE_ANNOTATIONS = null;
|
||||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||||
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
|
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false)
|
||||||
private int NUM_GAUSSIANS = 1;
|
private int NUM_GAUSSIANS = 6;
|
||||||
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
|
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed when clustering", required=false)
|
||||||
private int NUM_ITERATIONS = 10;
|
private int NUM_ITERATIONS = 10;
|
||||||
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
|
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
|
||||||
private int MIN_VAR_IN_CLUSTER = 1000;
|
private int MIN_VAR_IN_CLUSTER = 0;
|
||||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||||
private String PATH_TO_RESOURCES = "R/";
|
private String PATH_TO_RESOURCES = "R/";
|
||||||
|
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false)
|
||||||
|
private double WEIGHT_KNOWNS = 8.0;
|
||||||
|
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
|
||||||
|
private double WEIGHT_HAPMAP = 120.0;
|
||||||
|
@Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
|
||||||
|
private double WEIGHT_1000GENOMES = 12.0;
|
||||||
|
@Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
|
||||||
|
private double WEIGHT_MQ1 = 10.0;
|
||||||
|
|
||||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||||
//private int NUM_KNN = 2000;
|
//private int NUM_KNN = 2000;
|
||||||
|
|
@ -86,9 +92,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
private ExpandingArrayList<String> annotationKeys;
|
private ExpandingArrayList<String> annotationKeys;
|
||||||
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
|
|
||||||
private Set<String> ignoreInputFilterSet = null;
|
private Set<String> ignoreInputFilterSet = null;
|
||||||
private boolean usingDBSNP = false;
|
private int maxAC = 0;
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
@ -103,14 +108,18 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo -- shouldn't be caching these values -- reduces code flexibility
|
boolean foundDBSNP = false;
|
||||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||||
final RMDTrack rod = source.getReferenceOrderedData();
|
final RMDTrack rod = source.getReferenceOrderedData();
|
||||||
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||||
usingDBSNP = true;
|
foundDBSNP = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(!foundDBSNP) {
|
||||||
|
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -141,19 +150,23 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
final VariantDatum variantDatum = new VariantDatum();
|
final VariantDatum variantDatum = new VariantDatum();
|
||||||
variantDatum.annotations = annotationValues;
|
variantDatum.annotations = annotationValues;
|
||||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||||
|
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
|
||||||
// todo this is a bit ugly logic -- why not just look for DBSNP?
|
if( variantDatum.alleleCount > maxAC ) {
|
||||||
boolean isKnown = !vc.getAttribute("ID").equals(".");
|
maxAC = variantDatum.alleleCount;
|
||||||
if(usingDBSNP) {
|
|
||||||
isKnown = false;
|
|
||||||
for( final VariantContext dbsnpVC : tracker.getVariantContexts(ref, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
|
|
||||||
if(dbsnpVC != null && dbsnpVC.isSNP()) {
|
|
||||||
isKnown = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
variantDatum.isKnown = isKnown;
|
|
||||||
|
|
||||||
|
variantDatum.isKnown = false;
|
||||||
|
variantDatum.weight = 1.0;
|
||||||
|
|
||||||
|
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||||
|
if( dbsnp != null ) {
|
||||||
|
variantDatum.isKnown = true;
|
||||||
|
variantDatum.weight = WEIGHT_KNOWNS;
|
||||||
|
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
|
||||||
|
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
|
||||||
|
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
|
||||||
|
}
|
||||||
|
|
||||||
mapList.add( variantDatum );
|
mapList.add( variantDatum );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -191,7 +204,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
VariantGaussianMixtureModel theModel;
|
VariantGaussianMixtureModel theModel;
|
||||||
switch (OPTIMIZATION_MODEL) {
|
switch (OPTIMIZATION_MODEL) {
|
||||||
case GAUSSIAN_MIXTURE_MODEL:
|
case GAUSSIAN_MIXTURE_MODEL:
|
||||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
|
theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC );
|
||||||
break;
|
break;
|
||||||
//case K_NEAREST_NEIGHBORS:
|
//case K_NEAREST_NEIGHBORS:
|
||||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||||
|
|
@ -201,10 +214,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
}
|
}
|
||||||
|
|
||||||
theModel.run( CLUSTER_FILENAME );
|
theModel.run( CLUSTER_FILENAME );
|
||||||
|
|
||||||
theModel.outputClusterReports( CLUSTER_FILENAME );
|
theModel.outputClusterReports( CLUSTER_FILENAME );
|
||||||
|
|
||||||
|
|
||||||
for( final String annotation : annotationKeys ) {
|
for( final String annotation : annotationKeys ) {
|
||||||
// Execute Rscript command to plot the optimization curve
|
// Execute Rscript command to plot the optimization curve
|
||||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue