optimizer now uses -an arguments instead of exclude and force for clarity. command-line length reduced by 50%

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3361 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-14 15:41:44 +00:00
parent 88bd7a2045
commit 1538dc0144
1 changed files with 15 additions and 34 deletions

View File

@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
/**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
@ -64,10 +61,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
private String[] EXCLUDED_ANNOTATIONS = null;
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
private String[] FORCED_ANNOTATIONS = null;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
@ -90,9 +87,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
/////////////////////////////
// Private Member Variables
/////////////////////////////
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
private boolean firstVariant = true;
private int numAnnotations = 0;
private ExpandingArrayList<String> annotationKeys;
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
private Set<String> ignoreInputFilterSet = null;
private boolean usingDBSNP = false;
@ -104,9 +99,13 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
if( IGNORE_INPUT_FILTERS != null ) {
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
}
// todo -- shouldn't be caching these values -- reduces code flexibility
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
@ -130,32 +129,12 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
return mapList;
}
double annotationValues[] = new double[numAnnotations];
double annotationValues[] = new double[annotationKeys.size()];
// todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
annotationKeys.addAll( vc.getAttributes().keySet() );
annotationKeys.add("QUAL");
if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?
if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); }
if( EXCLUDED_ANNOTATIONS != null ) {
for( final String excludedAnnotation : EXCLUDED_ANNOTATIONS ) {
if( annotationKeys.contains( excludedAnnotation ) ) { annotationKeys.remove( excludedAnnotation ); }
}
}
if( FORCED_ANNOTATIONS != null ) {
for( final String forcedAnnotation : FORCED_ANNOTATIONS ) {
if( !annotationKeys.contains( forcedAnnotation ) ) { annotationKeys.add( forcedAnnotation ); }
}
}
numAnnotations = annotationKeys.size();
annotationValues = new double[numAnnotations];
firstVariant = false;
}
int iii = 0;
for( final String key : annotationKeys ) {
@ -180,6 +159,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
final VariantDatum variantDatum = new VariantDatum();
variantDatum.annotations = annotationValues;
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
// todo this is a bit ugly logic -- why not just look for DBSNP?
boolean isKnown = !vc.getAttribute("ID").equals(".");
if(usingDBSNP) {
isKnown = false;
@ -254,7 +235,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
p.waitFor();
} catch (InterruptedException e) {
e.printStackTrace();
System.exit(-1);
throw new StingException(e.getMessage());
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}