From ca2a0266dc9e82a97cfb74c4b4d761749744286c Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 8 Mar 2010 14:04:33 +0000 Subject: [PATCH] Converting annotation values that are set to Double.Infinity git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2953 348d0f76-0448-11de-a6fe-93d51630548a --- .../VariantGaussianMixtureModel.java | 12 ++++++------ .../walkers/variantoptimizer/VariantOptimizer.java | 13 ++++++++++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index 30717a1a7..f39bb7b09 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -68,23 +68,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel public final void run( final String outputPrefix ) { // Create the subset of the data to cluster with - int numSubset = 0; + int numNovel = 0; for( final VariantDatum datum : dataManager.data ) { if( !datum.isKnown ) { - numSubset++; + numNovel++; } } VariantDatum[] data; - if( numSubset * 2 * 1.3 < dataManager.numVariants ) { - data = new VariantDatum[numSubset*2]; + if( numNovel * 2 * 1.3 < dataManager.numVariants ) { + data = new VariantDatum[numNovel*2]; int iii = 0; for( final VariantDatum datum : dataManager.data ) { if( !datum.isKnown ) { data[iii++] = datum; } } - while( iii < numSubset*2 ) { // grab an equal number of known variants at random + while( iii < numNovel*2 ) { // grab an equal number of known variants at random final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; if( datum.isKnown ) { data[iii++] = datum; @@ -94,7 +94,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel data = dataManager.data; } - System.out.println("Clustering with " + data.length + " variants..."); + System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants..."); if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2*numNovel is so large compared to the full set) "); } createClusters( data ); // Using a subset of the data System.out.println("Printing out cluster parameters..."); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index ad8b57093..c4ce9adf2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -57,6 +57,8 @@ public class VariantOptimizer extends RodWalker private boolean IGNORE_INPUT_FILTERS = false; @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false) private String[] EXCLUDED_ANNOTATIONS = null; + @Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false) + private String[] FORCED_ANNOTATIONS = null; @Argument(fullName="output", shortName="output", doc="The output file name", required=false) private String OUTPUT_FILE = "optimizer.data"; @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian mixture model", required=false) @@ -71,6 +73,7 @@ public class VariantOptimizer extends RodWalker private final ExpandingArrayList annotationKeys = new ExpandingArrayList(); private boolean firstVariant = true; private int numAnnotations = 0; + private static final double INFINITE_ANNOTATION_VALUE = 10000.0; //--------------------------------------------------------------------------------------------------------------- // @@ -110,6 +113,11 @@ public class VariantOptimizer extends RodWalker if( annotationKeys.contains( excludedAnnotation ) ) { annotationKeys.remove( excludedAnnotation ); } } } + if( FORCED_ANNOTATIONS != null ) { + for( final String forcedAnnotation : FORCED_ANNOTATIONS ) { + if( !annotationKeys.contains( forcedAnnotation ) ) { annotationKeys.add( forcedAnnotation ); } + } + } numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL") annotationValues = new double[numAnnotations]; firstVariant = false; @@ -121,6 +129,9 @@ public class VariantOptimizer extends RodWalker double value = 0.0; try { value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) ); + if( Double.isInfinite(value) ) { + value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE; + } } catch( NumberFormatException e ) { // do nothing, default value is 0.0, } @@ -166,7 +177,7 @@ public class VariantOptimizer extends RodWalker logger.info( "The annotations are: " + annotationKeys + " and QUAL." ); dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] - + // Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it final VariantOptimizationModel gmm = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS ); gmm.run( OUTPUT_FILE );