GenerateVariantClusters and VariantRecalibrator now use hapmap and 1kg ROD bindings (in addition to dbsnp) to distinguish between knowns and novels. They no longer look at the by-hapmap validation status, so providing hapmap is highly recommended. Example on the wiki. Input variant tracks now must start with "input".

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4113 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-25 18:33:40 +00:00
parent bf0b6bd486
commit 5623e01602
5 changed files with 85 additions and 64 deletions

View File

@ -84,14 +84,12 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
// private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for MQ2+ known variants during clustering", required=false)
private double WEIGHT_KNOWNS = 0.0;
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
private double WEIGHT_DBSNP = 0.0;
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for HapMap variants during clustering", required=false)
private double WEIGHT_HAPMAP = 1.0;
@Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
private double WEIGHT_1000GENOMES = 1.0;
@Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
private double WEIGHT_MQ1 = 0.0;
@Argument(fullName="weight1KG", shortName="weight1KG", doc="The weight for 1000 Genomes Project variants during clustering", required=false)
private double WEIGHT_1KG = 1.0;
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
private boolean FORCE_INDEPENDENT = false;
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
@ -113,7 +111,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
/////////////////////////////
private ExpandingArrayList<String> annotationKeys;
private Set<String> ignoreInputFilterSet = null;
private int maxAC = 0;
private Set<String> inputNames = new HashSet<String>();
//---------------------------------------------------------------------------------------------------------------
//
@ -131,11 +130,21 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
}
boolean foundDBSNP = false;
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {
inputNames.add(d.getName());
logger.info("Found input variant track with name " + d.getName());
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP);
foundDBSNP = true;
} else if ( d.getName().equals("hapmap") ) {
logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP);
foundDBSNP = true;
} else if ( d.getName().equals("1kg") ) {
logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG);
foundDBSNP = true;
} else {
logger.info("Not evaluating ROD binding " + d.getName());
}
}
@ -160,8 +169,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final double annotationValues[] = new double[annotationKeys.size()];
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
int iii = 0;
for( final String key : annotationKeys ) {
@ -171,23 +180,24 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final VariantDatum variantDatum = new VariantDatum();
variantDatum.annotations = annotationValues;
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
if( variantDatum.alleleCount > maxAC ) {
maxAC = variantDatum.alleleCount;
}
variantDatum.isKnown = false;
variantDatum.weight = WEIGHT_NOVELS;
variantDatum.qual = vc.getPhredScaledQual();
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
if( dbsnp != null ) {
variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_KNOWNS;
final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false);
final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false);
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
if( vcHapMap != null ) {
variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_HAPMAP;
} else if( vc1KG != null ) {
variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_1KG;
} else if( dbsnp != null ) {
variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_DBSNP;
}
if( variantDatum.weight > 0.0 && variantDatum.qual > QUAL_THRESHOLD ) {
@ -220,7 +230,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
logger.info( "There are " + dataManager.numVariants + " variants with >0 clustering weight and " + dataManager.numAnnotations + " annotations." );
logger.info( "The annotations are: " + annotationKeys );
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]

View File

@ -37,5 +37,4 @@ public class VariantDatum {
public boolean isKnown;
public double qual;
public double weight;
public int alleleCount;
}

View File

@ -167,11 +167,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
public final void run( final PrintStream clusterFile ) {
// int numValid = 0;
// int numOutlier = 0;
// int numBadQual = 0;
// int numZeroWeight = 0;
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value
// Filtering based on known status and qual threshold happens in GenerateVariantClusters
for( int iii = 0; iii < dataManager.data.length; iii++ ) {
@ -184,21 +179,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
// logger.info("Clustering with " + data.length + " valid variants.");
// logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
// logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
// logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
generateEmpricalStats( dataManager.data );
logger.info("Initializing using k-means...");
initializeUsingKMeans( dataManager.data );
logger.info("... done!");
createClusters( dataManager.data, 0, maxGaussians, clusterFile );
// Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants.");
//createClusters( dataManager.data, 0, numGaussians, clusterFileName );
}
private void generateEmpricalStats( final VariantDatum[] data ) {

View File

@ -90,17 +90,19 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="hapmap_prior", shortName="hapmapPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private double HAPMAP_QUAL_PRIOR = 15.0;
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private double KNOWN_QUAL_PRIOR = 10.0;
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double NOVEL_QUAL_PRIOR = 2.0;
@Argument(fullName="priorNovels", shortName="priorNovels", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double PRIOR_NOVELS = 2.0;
@Argument(fullName="priorDBSNP", shortName="priorDBSNP", doc="A prior on the quality of dbSNP variants, a phred scaled probability of being true.", required=false)
private double PRIOR_DBSNP = 10.0;
@Argument(fullName="priorHapMap", shortName="priorHapMap", doc="A prior on the quality of HapMap variants, a phred scaled probability of being true.", required=false)
private double PRIOR_HAPMAP = 15.0;
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
private double PRIOR_1KG = 12.0;
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private Double[] FDR_TRANCHES = null;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
private double QUAL_STEP = 0.1;
@ -115,6 +117,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
private VariantGaussianMixtureModel theModel = null;
private Set<String> ignoreInputFilterSet = null;
private Set<String> inputNames = new HashSet<String>();
//---------------------------------------------------------------------------------------------------------------
//
@ -146,7 +149,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
final TreeSet<String> samples = new TreeSet<String>();
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
@ -156,10 +158,21 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
vcfWriter.writeHeader(vcfHeader);
boolean foundDBSNP = false;
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {
inputNames.add(d.getName());
logger.info("Found input variant track with name " + d.getName());
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
foundDBSNP = true;
} else if ( d.getName().equals("hapmap") ) {
logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
foundDBSNP = true;
} else if ( d.getName().equals("1kg") ) {
logger.info("Found 1KG track for with prior probability = Q" + PRIOR_1KG);
foundDBSNP = true;
} else {
logger.info("Not evaluating ROD binding " + d.getName());
}
}
@ -191,21 +204,33 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
return mapList;
}
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
final VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
variantDatum.isKnown = dbsnp != null;
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false);
final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false);
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
double knownPrior = variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR);
if(variantDatum.isKnown && DbSNPHelper.isHapmap( dbsnp ) ) {
knownPrior = QualityUtils.qualToProb(HAPMAP_QUAL_PRIOR);
variantDatum.isKnown = false;
double knownPrior_qScore = PRIOR_NOVELS;
if( vcHapMap != null ) {
variantDatum.isKnown = true;
knownPrior_qScore = PRIOR_HAPMAP;
} else if( vc1KG != null ) {
variantDatum.isKnown = true;
knownPrior_qScore = PRIOR_1KG;
} else if( dbsnp != null ) {
variantDatum.isKnown = true;
knownPrior_qScore = PRIOR_DBSNP;
}
final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
final int alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
final double acPrior = theModel.getAlleleCountPrior( alleleCount );
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {

View File

@ -14,7 +14,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e6724946c3298b3d74bb1ba1396a9190" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "79727fb46bb60be30eae2666e4f792b6" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -28,6 +28,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -L 1:1-100,000,000" +
" --ignore_filter GATK_STANDARD" +
" -an QD -an HRun -an SB" +
" -weightDBSNP 1.0" +
" -clusterFile %s",
1, // just one output file
Arrays.asList(md5));
@ -39,7 +40,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b97ab64b86ce8c8698855058d32853ce" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "446f0de46f59344ec5579d0c8b54ee66" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -61,7 +62,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -tranchesFile %s" +
" -reportDatFile %s",
3, // two output file
Arrays.asList(md5, "f603aa2052ae6e81a6bde63a8a3f9539","951a17f9c11de391763b9a8cb239205a"));
Arrays.asList(md5, "d979864c46ae237f52f1002f0ec19b16","ab438d03e276a053d7534056f62c83f3"));
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
@ -74,7 +75,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testApplyVariantCuts() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e80f26d68be2b183fd7f062039cef28a" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b25c2a563362897271bbee9db3f9c3af" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();