optimizer now uses -an arguments instead of exclude and force for clarity. command-line length reduced by 50%

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3361 348d0f76-0448-11de-a6fe-93d51630548a
2010-05-14 15:41:44 +00:00 · 2010-05-14 15:41:44 +00:00 · 1538dc0144
parent 88bd7a2045
commit 1538dc0144
1 changed files with 15 additions and 34 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.commandline.Argument;

 import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
+import java.util.*;

 /**
 * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
@ -64,10 +61,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
-    @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
-    private String[] EXCLUDED_ANNOTATIONS = null;
-    @Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
-    private String[] FORCED_ANNOTATIONS = null;
+
+    @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
+    private String[] USE_ANNOTATIONS = null;
+
    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
    private String CLUSTER_FILENAME = "optimizer.cluster";
    @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
@ -90,9 +87,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
-    private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
-    private boolean firstVariant = true;
-    private int numAnnotations = 0;
+    private ExpandingArrayList<String> annotationKeys;
    private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
    private Set<String> ignoreInputFilterSet = null;
    private boolean usingDBSNP = false;
@ -104,9 +99,13 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    //---------------------------------------------------------------------------------------------------------------

    public void initialize() {
+        annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
+
        if( IGNORE_INPUT_FILTERS != null ) {
            ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
        }
+
+        // todo -- shouldn't be caching these values -- reduces code flexibility
        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
@ -130,32 +129,12 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
            return mapList;
        }

-        double annotationValues[] = new double[numAnnotations];
+        double annotationValues[] = new double[annotationKeys.size()];

+        // todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
        for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
-
            if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
                if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
-                    if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
-                        annotationKeys.addAll( vc.getAttributes().keySet() );
-                        annotationKeys.add("QUAL");
-                        if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?
-                        if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); }
-                        if( EXCLUDED_ANNOTATIONS != null ) {
-                            for( final String excludedAnnotation : EXCLUDED_ANNOTATIONS ) {
-                                if( annotationKeys.contains( excludedAnnotation ) ) { annotationKeys.remove( excludedAnnotation ); }
-                            }
-                        }
-                        if( FORCED_ANNOTATIONS != null ) {
-                            for( final String forcedAnnotation : FORCED_ANNOTATIONS ) {
-                                if( !annotationKeys.contains( forcedAnnotation ) ) { annotationKeys.add( forcedAnnotation ); }
-                            }
-                        }
-                        numAnnotations = annotationKeys.size();
-                        annotationValues = new double[numAnnotations];
-                        firstVariant = false;
-                    }
-
                    int iii = 0;
                    for( final String key : annotationKeys ) {

@ -180,6 +159,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                    final VariantDatum variantDatum = new VariantDatum();
                    variantDatum.annotations = annotationValues;
                    variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
+
+                    // todo this is a bit ugly logic -- why not just look for DBSNP?
                    boolean isKnown = !vc.getAttribute("ID").equals(".");
                    if(usingDBSNP) {
                        isKnown = false;
@ -254,7 +235,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                p.waitFor();
            } catch (InterruptedException e) {
                e.printStackTrace();
-                System.exit(-1);
+                throw new StingException(e.getMessage());
            } catch ( IOException e ) {
                throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
            }