diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index 35dc5bc0d..0e7bdc21a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -84,14 +84,12 @@ public class GenerateVariantClustersWalker extends RodWalker annotationKeys; private Set ignoreInputFilterSet = null; - private int maxAC = 0; + private Set inputNames = new HashSet(); + //--------------------------------------------------------------------------------------------------------------- // @@ -131,11 +130,21 @@ public class GenerateVariantClustersWalker extends RodWalker dataSources = this.getToolkit().getRodDataSources(); - for( final ReferenceOrderedDataSource source : dataSources ) { - final RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { + if( d.getName().startsWith("input") ) { + inputNames.add(d.getName()); + logger.info("Found input variant track with name " + d.getName()); + } else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP); foundDBSNP = true; + } else if ( d.getName().equals("hapmap") ) { + logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP); + foundDBSNP = true; + } else if ( d.getName().equals("1kg") ) { + logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG); + foundDBSNP = true; + } else { + logger.info("Not evaluating ROD binding " + d.getName()); } } @@ -160,8 +169,8 @@ public class GenerateVariantClustersWalker extends RodWalker maxAC ) { - maxAC = variantDatum.alleleCount; - } variantDatum.isKnown = false; variantDatum.weight = WEIGHT_NOVELS; variantDatum.qual = vc.getPhredScaledQual(); final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - if( dbsnp != null ) { - variantDatum.isKnown = true; - variantDatum.weight = WEIGHT_KNOWNS; + final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false); + final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false); - if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; } - else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; } - else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; } + if( vcHapMap != null ) { + variantDatum.isKnown = true; + variantDatum.weight = WEIGHT_HAPMAP; + } else if( vc1KG != null ) { + variantDatum.isKnown = true; + variantDatum.weight = WEIGHT_1KG; + } else if( dbsnp != null ) { + variantDatum.isKnown = true; + variantDatum.weight = WEIGHT_DBSNP; } if( variantDatum.weight > 0.0 && variantDatum.qual > QUAL_THRESHOLD ) { @@ -220,7 +230,7 @@ public class GenerateVariantClustersWalker extends RodWalker0 clustering weight and " + dataManager.numAnnotations + " annotations." ); logger.info( "The annotations are: " + annotationKeys ); dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index 117a50deb..b12aec37b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -37,5 +37,4 @@ public class VariantDatum { public boolean isKnown; public double qual; public double weight; - public int alleleCount; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java index dc30df25c..7558b35db 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java @@ -167,11 +167,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel public final void run( final PrintStream clusterFile ) { -// int numValid = 0; -// int numOutlier = 0; -// int numBadQual = 0; -// int numZeroWeight = 0; - // Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value // Filtering based on known status and qual threshold happens in GenerateVariantClusters for( int iii = 0; iii < dataManager.data.length; iii++ ) { @@ -184,21 +179,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } -// logger.info("Clustering with " + data.length + " valid variants."); -// logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight."); -// logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value."); -// logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ")."); - generateEmpricalStats( dataManager.data ); logger.info("Initializing using k-means..."); initializeUsingKMeans( dataManager.data ); logger.info("... done!"); createClusters( dataManager.data, 0, maxGaussians, clusterFile ); - - // Simply cluster with all the variants. The knowns have been given more weight than the novels - //logger.info("Clustering with " + dataManager.data.length + " variants."); - //createClusters( dataManager.data, 0, numGaussians, clusterFileName ); } private void generateEmpricalStats( final VariantDatum[] data ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 6bfd3b984..2573bb6e1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -90,17 +90,19 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = null; + private Set inputNames = new HashSet(); //--------------------------------------------------------------------------------------------------------------- // @@ -146,7 +149,6 @@ public class VariantRecalibrator extends RodWalker hInfo = new HashSet(); final TreeSet samples = new TreeSet(); - final List dataSources = this.getToolkit().getRodDataSources(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score")); hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); @@ -156,10 +158,21 @@ public class VariantRecalibrator extends RodWalker e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e6724946c3298b3d74bb1ba1396a9190" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "79727fb46bb60be30eae2666e4f792b6" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -28,6 +28,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 1:1-100,000,000" + " --ignore_filter GATK_STANDARD" + " -an QD -an HRun -an SB" + + " -weightDBSNP 1.0" + " -clusterFile %s", 1, // just one output file Arrays.asList(md5)); @@ -39,7 +40,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testVariantRecalibrator() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b97ab64b86ce8c8698855058d32853ce" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "446f0de46f59344ec5579d0c8b54ee66" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -61,7 +62,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile %s" + " -reportDatFile %s", 3, // two output file - Arrays.asList(md5, "f603aa2052ae6e81a6bde63a8a3f9539","951a17f9c11de391763b9a8cb239205a")); + Arrays.asList(md5, "d979864c46ae237f52f1002f0ec19b16","ab438d03e276a053d7534056f62c83f3")); List result = executeTest("testVariantRecalibrator", spec).getFirst(); inputVCFFiles.put(vcf, result.get(0).getAbsolutePath()); tranchesFiles.put(vcf, result.get(1).getAbsolutePath()); @@ -74,7 +75,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testApplyVariantCuts() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e80f26d68be2b183fd7f062039cef28a" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b25c2a563362897271bbee9db3f9c3af" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey();