diff --git a/ivy.xml b/ivy.xml index f76880b94..5a8c3986a 100644 --- a/ivy.xml +++ b/ivy.xml @@ -97,7 +97,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 94ed23caf..c201e95f0 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -289,7 +289,7 @@ public abstract class ArgumentTypeDescriptor { return field.isAnnotationPresent(Hidden.class); } - public Class makeRawTypeIfNecessary(Type t) { + public static Class makeRawTypeIfNecessary(Type t) { if ( t == null ) return null; else if ( t instanceof ParameterizedType ) @@ -300,6 +300,115 @@ public abstract class ArgumentTypeDescriptor { throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); } } + + /** + * The actual argument parsing method. + * @param source source + * @param type type to check + * @param matches matches + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. + */ + protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + String value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + return parseBinding(value, parameterType, type, name, tags, source.field.getName()); + } + + /** + * + * @param value The source of the binding + * @param parameterType The Tribble Feature parameter type + * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. 
+ * @param bindingName The name of the binding passed to the constructor. + * @param tags Tags for the binding used for parsing and passed to the constructor. + * @param fieldName The name of the field that was parsed. Used for error reporting. + * @return The newly created binding object of type bindingClass. + */ + public static Object parseBinding(String value, Class parameterType, Type bindingClass, + String bindingName, Tags tags, String fieldName) { + try { + String tribbleType = null; + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. " + + "Rod bindings only support -X:type and -X:name,type argument styles", + value, fieldName)); + } else if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + bindingName = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + + FeatureManager manager = new FeatureManager(); + if ( manager.getByName(tribbleType) == null ) + throw new UserException.UnknownTribbleType( + tribbleType, + String.format("Unable to find tribble type '%s' provided on the command line. " + + "Please select a correct type from among the supported types:%n%s", + tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); + + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? 
tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + bindingName = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = new File(value); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.info("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) { + // IntervalBinding can be created from a normal String + Class rawType = (makeRawTypeIfNecessary(bindingClass)); + try { + return rawType.getConstructor(String.class).newInstance(value); + } catch (NoSuchMethodException e) { + /* ignore */ + } + + if ( ! file.exists() ) { + throw new UserException.CouldNotReadInputFile(file, "file does not exist"); + } else if ( ! file.canRead() || ! file.isFile() ) { + throw new UserException.CouldNotReadInputFile(file, "file could not be read"); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags); + } catch (Exception e) { + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. 
Message: %s", + value, fieldName, e.getMessage())); + } + } } /** @@ -324,6 +433,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { public boolean createsTypeDefault(ArgumentSource source) { return ! source.isRequired(); } @Override + @SuppressWarnings("unchecked") public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { Class parameterType = JVMUtils.getParameterizedTypeClass(type); return RodBinding.makeUnbound((Class)parameterType); @@ -336,118 +446,16 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return parse(parsingEngine, source, type, matches, false); - } - - /** - * The actual argument parsing method. - * - * IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided - * that it's the best way to proceed for now. IntervalBindings can either be proper RodBindings (hence the use of - * this parse() method) or can be Strings (representing raw intervals or the files containing them). If createIntervalBinding - * is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the - * usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other - * side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically - * continue to input them out of order (whereas Tribble Features are ordered). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding? - * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
- */ - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) { - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - String value = getArgumentValue( defaultDefinition, matches ); - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - - try { - String name = defaultDefinition.fullName; - String tribbleType = null; - Tags tags = getArgumentTags(matches); - // must have one or two tag values here - if ( tags.getPositionalTags().size() > 2 ) { - throw new UserException.CommandLineException( - String.format("Unexpected number of positional tags for argument %s : %s. " + - "Rod bindings only support -X:type and -X:name,type argument styles", - value, source.field.getName())); - } if ( tags.getPositionalTags().size() == 2 ) { - // -X:name,type style - name = tags.getPositionalTags().get(0); - tribbleType = tags.getPositionalTags().get(1); - } else { - // case with 0 or 1 positional tags - FeatureManager manager = new FeatureManager(); - - // -X:type style is a type when we cannot determine the type dynamically - String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; - if ( tag1 != null ) { - if ( manager.getByName(tag1) != null ) // this a type - tribbleType = tag1; - else - name = tag1; - } - - if ( tribbleType == null ) { - // try to determine the file type dynamically - File file = new File(value); - if ( file.canRead() && file.isFile() ) { - FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); - if ( featureDescriptor != null ) { - tribbleType = featureDescriptor.getName(); - logger.info("Dynamically determined type of " + file + " to be " + tribbleType); - } - } - - if ( tribbleType == null ) { - // IntervalBindings allow streaming conversion of Strings - if ( createIntervalBinding ) { - return new IntervalBinding(value); - } - - if ( ! 
file.exists() ) { - throw new UserException.CouldNotReadInputFile(file, "file does not exist"); - } else if ( ! file.canRead() || ! file.isFile() ) { - throw new UserException.CouldNotReadInputFile(file, "file could not be read"); - } else { - throw new UserException.CommandLineException( - String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + - "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", - manager.userFriendlyListOfAvailableFeatures(parameterType))); - } - } - } - } - - Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - Object result; - if ( createIntervalBinding ) { - result = ctor.newInstance(parameterType, name, value, tribbleType, tags); - } else { - RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); - parsingEngine.addTags(rbind, tags); - parsingEngine.addRodBinding(rbind); - result = rbind; - } - return result; - } catch (InvocationTargetException e) { - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s.", - value, source.field.getName())); - } catch (Exception e) { - if ( e instanceof UserException ) - throw ((UserException)e); - else - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. 
Message: %s", - value, source.field.getName(), e.getMessage())); - } + Tags tags = getArgumentTags(matches); + RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + return rbind; } } /** - * Parser for RodBinding objects + * Parser for IntervalBinding objects */ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { /** @@ -475,7 +483,7 @@ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { */ @Override public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true); + return parseBinding(source, type, matches, getArgumentTags(matches)); } } @@ -783,7 +791,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { } Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); - Constructor multiplexerConstructor = null; + Constructor multiplexerConstructor; try { multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); multiplexerConstructor.setAccessible(true); @@ -792,7 +800,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { throw new ReviewedStingException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); } - Multiplexer multiplexer = null; + Multiplexer multiplexer; try { multiplexer = multiplexerConstructor.newInstance(sourceValues); } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java index 452309e89..26af49e12 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java @@ -78,24 +78,7 @@ public 
abstract class ParsingMethod { String argument = matcher.group(1).trim(); - Tags tags = new Tags(); - if(matcher.group(2) != null) { - for(String tag: Utils.split(matcher.group(2),",")) { - // Check for presence of an '=' sign, indicating a key-value pair in the tag line. - int equalDelimiterPos = tag.indexOf('='); - if(equalDelimiterPos >= 0) { - // Sanity check; ensure that there aren't multiple '=' in this key-value pair. - if(tag.indexOf('=',equalDelimiterPos+1) >= 0) - throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. Please ensure that " + - "key-value tags are of the form =, and neither key " + - "nor value contain the '=' character", tag, argument)); - tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1)); - } - else - tags.addPositionalTag(tag); - - } - } + Tags tags = parseTags(argument, matcher.group(2)); // Find the most appropriate argument definition for the given argument. ArgumentDefinition argumentDefinition = definitions.findArgumentDefinition( argument, definitionMatcher ); @@ -105,6 +88,28 @@ public abstract class ParsingMethod { return new ArgumentMatch(argument,argumentDefinition,position,tags); } + public static Tags parseTags(String argument, String tagString) { + Tags tags = new Tags(); + if (tagString != null) { + for(String tag: Utils.split(tagString, ",")) { + // Check for presence of an '=' sign, indicating a key-value pair in the tag line. + int equalDelimiterPos = tag.indexOf('='); + if(equalDelimiterPos >= 0) { + // Sanity check; ensure that there aren't multiple '=' in this key-value pair. + if(tag.indexOf('=',equalDelimiterPos+1) >= 0) + throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. 
Please ensure that " + + "key-value tags are of the form =, and neither key " + + "nor value contain the '=' character", tag, argument)); + tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1)); + } + else + tags.addPositionalTag(tag); + + } + } + return tags; + } + /** * A command-line argument always starts with an alphabetical character or underscore followed by any word character. */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 6fa70f437..68680dd10 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,7 +30,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; -import org.broad.tribble.Feature; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; @@ -54,9 +53,9 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -582,7 +581,6 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no 
interval arguments at all if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) return; @@ -590,17 +588,22 @@ public class GenomeAnalysisEngine { // Note that the use of '-L all' is no longer supported. // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, argCollection.intervalSetRule, argCollection.intervalPadding)); + + Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + this.referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); // if no exclude arguments, can return parseIntervalArguments directly - if ( argCollection.excludeIntervals == null ) + if ( excludeSortedSet == null ) intervals = includeSortedSet; // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION); intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given @@ -613,43 +616,6 @@ public class GenomeAnalysisEngine { } } - /** - * Loads the intervals relevant to the current execution - * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above - * @param rule interval merging rule - * @return A sorted, merged list of all intervals specified in this arg list. 
- */ - protected GenomeLocSortedSet loadIntervals( final List> argList, final IntervalSetRule rule ) { - return loadIntervals(argList, rule, 0); - } - - /** - * Loads the intervals relevant to the current execution - * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above - * @param rule interval merging rule - * @param padding how much to pad the intervals - * @return A sorted, merged list of all intervals specified in this arg list. - */ - protected GenomeLocSortedSet loadIntervals( final List> argList, final IntervalSetRule rule, final int padding ) { - - List allIntervals = new ArrayList(); - for ( IntervalBinding intervalBinding : argList ) { - List intervals = intervalBinding.getIntervals(this.getGenomeLocParser()); - - if ( intervals.isEmpty() ) { - logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); - } - - if ( padding > 0 ) { - intervals = IntervalUtils.getIntervalsWithFlanks(this.getGenomeLocParser(), intervals, padding); - } - - allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule); - } - - return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); - } - /** * Add additional, externally managed IO streams for inputs. 
* @@ -830,7 +796,8 @@ public class GenomeAnalysisEngine { throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e); } } - RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser,header,validationExclusionType); + + RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType); List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) @@ -854,6 +821,15 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(); } + public boolean lenientVCFProcessing() { + return lenientVCFProcessing(argCollection.unsafe); + } + + public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { + return val == ValidationExclusion.TYPE.ALL + || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; + } + /** * Returns the unmerged SAM file header for an individual reader. * @param reader The reader. diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index babbb7ab8..13c737a2e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -347,9 +347,6 @@ public class GATKArgumentCollection { public boolean USE_SLOW_GENOTYPES = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - @Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false) - public boolean allowMissingVCFHeaders = false; - /** * The file pointed to by this argument must be a VCF file. 
The GATK will read in just the header of this file * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java index 577f7929a..52c77326a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java @@ -40,6 +40,7 @@ public class ValidationExclusion { ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities + LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. 
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL ALL // do not check for all of the above conditions, DEFAULT } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index d0fdae639..fb05a6b04 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -107,8 +107,10 @@ public class VariantContextWriterStorage implements Storage, Var List options = new ArrayList(); if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); - if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); + if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY); return options.isEmpty() ? 
EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index 3f03b30dd..b5d5deedb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -85,16 +86,18 @@ public class FeatureManager { private final PluginManager pluginManager; private final Collection featureDescriptors = new TreeSet(); private final VCFHeader headerForRepairs; + private final boolean lenientVCFProcessing; /** * Construct a FeatureManager without a master VCF header */ public FeatureManager() { - this(null); + this(null, false); } - public FeatureManager(final VCFHeader headerForRepairs) { + public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) { this.headerForRepairs = headerForRepairs; + this.lenientVCFProcessing = lenientVCFProcessing; pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); for (final String rawName: pluginManager.getPluginsByName().keySet()) { @@ -252,8 +255,11 @@ public class FeatureManager { ((NameAwareCodec)codex).setName(name); if ( codex instanceof ReferenceDependentFeatureCodec ) ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - if ( codex instanceof 
VCFCodec) + if ( codex instanceof VCFCodec ) ((VCFCodec)codex).setHeaderForRepairs(headerForRepairs); + if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing ) + ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); + return codex; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 25e005601..e183fe169 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -34,6 +34,7 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; @@ -98,7 +99,7 @@ public class RMDTrackBuilder { // extends PluginManager { this.dict = dict; this.validationExclusionType = validationExclusionType; this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(headerForRepairs); + this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index d91ddd221..01fa92b8c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -71,11 +71,13 @@ public class BQSRGatherer extends Gatherer { if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { File 
recal_out = new File(output.getName() + ".original"); RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + // TODO -- fix me + //RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); } else if (!RAC.NO_PLOTS) { File recal_out = new File(output.getName() + ".recal"); - RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + // TODO -- fix me + //RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java deleted file mode 100644 index 29eecfbb1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ /dev/null @@ -1,329 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; - -/** - * This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR - * - * It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will - * add the event type as a bitset to the end of the covariate bitset key. This way, it won't get int the way of masking the information - * out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type. 
- * - * The keys represented by this key manager will always have the same order: - * - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType - * ... - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType - * - * - * Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary. - * - * @author Mauricio Carneiro - * @since 3/6/12 - */ -public class BQSRKeyManager { - - private final Covariate[] requiredCovariates; - private final Covariate[] optionalCovariates; - private final RequiredCovariateInfo[] requiredCovariatesInfo; - private final OptionalCovariateInfo[] optionalCovariatesInfo; - private final Map covariateNameToIDMap; - - private int nRequiredBits; // Number of bits used to represent the required covariates - - private final int optionalCovariateOffset; - private final int optionalCovariateIDOffset; - - private final long optionalCovariateMask; // Standard mask for optional covariates key - private final long optionalCovariateIDMask; // Standard mask for optional covariates order key - private final long eventIDMask; // Standard mask for event ID - - /** - * Initializes the KeyManager with the total number of covariates to use - * - * @param requiredCovariates the ordered list of required covariates - * @param optionalCovariates the ordered list of optional covariates - */ - public BQSRKeyManager(final List requiredCovariates, final List optionalCovariates) { - this.requiredCovariates = new Covariate[requiredCovariates.size()]; - this.optionalCovariates = new Covariate[optionalCovariates.size()]; - requiredCovariatesInfo = new RequiredCovariateInfo[requiredCovariates.size()]; // initialize the required covariates list - optionalCovariatesInfo = new 
OptionalCovariateInfo[optionalCovariates.size()]; // initialize the optional covariates list (size may be 0, it's okay) - covariateNameToIDMap = new HashMap(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) - - nRequiredBits = 0; - for (int i = 0; i < requiredCovariates.size(); i++) { // create a list of required covariates with the extra information for key management - final Covariate required = requiredCovariates.get(i); - final int nBits = required.numberOfBits(); // number of bits used by this covariate - final long mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate - this.requiredCovariates[i] = required; - requiredCovariatesInfo[i] = new RequiredCovariateInfo(nBits, nRequiredBits, mask, required); // Create an object for this required covariate - nRequiredBits += nBits; - } - - final int bitsInEventType = numberOfBitsToRepresent(EventType.values().length); - eventIDMask = genericMask(nRequiredBits, bitsInEventType); - - short id = 0; - int nOptionalBits = 0; - for (int i = 0; i < optionalCovariates.size(); i++) { - final Covariate optional = optionalCovariates.get(i); - nOptionalBits = Math.max(nOptionalBits, optional.numberOfBits()); // optional covariates are represented by the number of bits needed by biggest covariate - this.optionalCovariates[i] = optional; - optionalCovariatesInfo[i] = new OptionalCovariateInfo(id, optional); - final String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport - covariateNameToIDMap.put(covariateName, id); - id++; - } - - optionalCovariateOffset = nRequiredBits + bitsInEventType; - optionalCovariateMask = genericMask(optionalCovariateOffset, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset - optionalCovariateIDOffset = 
nRequiredBits + bitsInEventType + nOptionalBits; - final int nOptionalIDBits = numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID - optionalCovariateIDMask = genericMask(optionalCovariateIDOffset, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset - - final int totalNumberOfBits = optionalCovariateIDOffset + nOptionalIDBits; // total number of bits used in the final key - if ( totalNumberOfBits > 64 ) - throw new UserException.BadInput("The total number of bits used for the master BQSR key is greater than 64 and cannot be represented in a long"); - } - - /** - * Generates one key given the optional covariate (or none if it is null) - * - * Keys include all required covariates, the standard covariate and the event type. - * - * @param allKeys The keys in long representation for each covariate (includes all optional covariates, not just the one requested) - * @param eventType The type of event described by this keyset (e.g. 
mismatches, insertions, deletions) - * @return one key in long representation (non-negative) or -1 for a bad key - */ - public long createMasterKey(final long[] allKeys, final EventType eventType, final int optionalCovariateIndex) { - - int keyIndex = 0; - long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on - for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) - masterKey |= (allKeys[keyIndex++] << infoRequired.offset); - - final long eventKey = keyFromEvent(eventType); // create a key for the event type - masterKey |= (eventKey << nRequiredBits); - - if (optionalCovariateIndex >= 0 && optionalCovariateIndex < optionalCovariates.length) { - final long covariateKey = allKeys[keyIndex + optionalCovariateIndex]; - if (covariateKey < 0) // do not add "nulls" to the final set of keys - return -1; - - masterKey |= (covariateKey << optionalCovariateOffset); - masterKey |= (optionalCovariatesInfo[optionalCovariateIndex].covariateID << optionalCovariateIDOffset); - } - - return masterKey; - } - - /** - * Generates one key for the covariates represented in Object[] key - * - * The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file) - * and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one key, not many. - * - * Example key: - * RG, QUAL, CYCLE, CYCLE_ID, EventType - * - * @param key list of objects produced by the required covariates followed by one or zero optional covariates. - * @return a key representing these objects. 
- */ - public long longFromKey(Object[] key) { - int requiredCovariate = 0; - long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on - for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) - masterKey |= (infoRequired.covariate.longFromKey(key[requiredCovariate++]) << infoRequired.offset); - - final int eventIndex = key.length - 1; // the event type is always the last key - final long eventKey = keyFromEvent((EventType) key[eventIndex]); // create a key for the event type - masterKey |= (eventKey << nRequiredBits); - - if (optionalCovariatesInfo.length > 0) { - final int covariateIndex = requiredCovariatesInfo.length; // the optional covariate index in the key array - final int covariateIDIndex = covariateIndex + 1; // the optional covariate ID index is right after the optional covariate's - final short covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index - final OptionalCovariateInfo infoOptional = optionalCovariatesInfo[covariateID]; // so we can get the optional covariate information - - final long covariateKey = infoOptional.covariate.longFromKey(key[covariateIndex]); // convert the optional covariate key into a bitset using the covariate's interface - masterKey |= (covariateKey << optionalCovariateOffset); - masterKey |= (infoOptional.covariateID << optionalCovariateIDOffset); - } - - return masterKey; - } - - /** - * Covariate id can be either the covariate name (String) or the actual id (short). This method - * finds it's type and converts accordingly to the short notation. - * - * @param id the string or short representation of the optional covariate id - * @return the short representation of the optional covariate id. - */ - private short parseCovariateID(final Object id) { - return (id instanceof String) ? 
covariateNameToIDMap.get(id.toString()) : (Short) id; - } - - /** - * Generates a key set of objects from a combined master key. - * - * Masks out each covariate independently and decodes their values (Object) into a keyset - * - * @param master the master representation of the keys - * @return an object array with the values for each key - */ - public List keySetFrom(final long master) { - final List objectKeys = new ArrayList(); - for (RequiredCovariateInfo info : requiredCovariatesInfo) { - final long covariateKey = extractKeyFromMaster(master, info.mask, info.offset); // get the covariate's key - objectKeys.add(info.covariate.formatKey(covariateKey)); // convert the key to object using covariate's interface - } - - if (optionalCovariatesInfo.length > 0) { - final long covKey = extractKeyFromMaster(master, optionalCovariateMask, optionalCovariateOffset); // get the covariate's key - final int covIDKey = (int)extractKeyFromMaster(master, optionalCovariateIDMask, optionalCovariateIDOffset); // get the covariate's id (to identify which covariate this is) - Covariate covariate = optionalCovariatesInfo[(short)covIDKey].covariate; // get the corresponding optional covariate object - objectKeys.add(covariate.formatKey(covKey)); // add the optional covariate key to the key set - objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id - } - - objectKeys.add(EventType.eventFrom((int)extractKeyFromMaster(master, eventIDMask, nRequiredBits))); // add the event type object to the key set - - return objectKeys; - } - - public Covariate[] getRequiredCovariates() { - return requiredCovariates; - } - - public Covariate[] getOptionalCovariates() { - return optionalCovariates; - } - - public int getNumRequiredCovariates() { - return requiredCovariates.length; - } - - public int getNumOptionalCovariates() { - return optionalCovariates.length; - } - - /** - * Creates a mask for the requested covariate to extract the relevant 
key from a combined master key - * - * @param offset the offset into the master key - * @param nBits the number of bits needed by the Covariate to represent its values - * @return the mask relevant to the covariate - */ - private long genericMask(final int offset, final int nBits) { - long mask = 0L; - for ( int i = 0; i < nBits; i++ ) - mask |= 1L << (offset+i); - return mask; - } - - private long extractKeyFromMaster(final long master, final long mask, final int offset) { - long key = master & mask; - return key >> offset; - } - - // cache the key representing an event since it's otherwise created a massive amount of times - private static final long[] eventTypeCache = new long[EventType.values().length]; // event IDs must be longs so that bit-fiddling works - static { - for (final EventType eventType : EventType.values()) - eventTypeCache[eventType.index] = (long)eventType.index; - } - - private long keyFromEvent(final EventType eventType) { - return eventTypeCache[eventType.index]; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof BQSRKeyManager)) - return false; - - BQSRKeyManager other = (BQSRKeyManager) o; - if (this == other) - return true; - - if (requiredCovariatesInfo.length != other.requiredCovariatesInfo.length || - optionalCovariatesInfo.length != other.optionalCovariatesInfo.length) - return false; - - for (int i = 0; i < requiredCovariates.length; i++) { - Covariate myRequiredCovariate = requiredCovariates[i]; - Covariate otherRequiredCovariate = other.requiredCovariates[i]; - String thisName = myRequiredCovariate.getClass().getSimpleName(); - String otherName = otherRequiredCovariate.getClass().getSimpleName(); - if (!thisName.equals(otherName)) - return false; - } - - for (int i = 0; i < optionalCovariates.length; i++) { - Covariate myOptionalCovariate = optionalCovariates[i]; - Covariate otherOptionalCovariate = other.optionalCovariates[i]; - String thisName = myOptionalCovariate.getClass().getSimpleName(); - String 
otherName = otherOptionalCovariate.getClass().getSimpleName(); - if (!thisName.equals(otherName)) - return false; - } - - return true; - } - - /** - * Calculates the number of bits necessary to represent a given number of elements - * - * @param numberOfElements the number of elements to represent (must be positive) - * @return the number of bits necessary to represent this many elements - */ - public static int numberOfBitsToRepresent(long numberOfElements) { - if (numberOfElements < 0) - throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements); - - if (numberOfElements == 1L) - return 1; // special case - - int n = 0; - numberOfElements--; - while (numberOfElements > 0) { - numberOfElements = numberOfElements >> 1; - n++; - } - return n; - } - - /** - * Aggregate information for each Covariate - */ - private static class RequiredCovariateInfo { - public final int nBits; // number of bits for this key - public final int offset; // the offset into the master key - public final long mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) - public final Covariate covariate; // this allows reverse lookup of the Covariates in order - - RequiredCovariateInfo(final int nBits, final int offset, final long mask, final Covariate covariate) { - this.nBits = nBits; - this.offset = offset; - this.mask = mask; - this.covariate = covariate; - } - } - - private static class OptionalCovariateInfo { - public final long covariateID; // cache the covariate ID (must be a long so that bit-fiddling works) - public final Covariate covariate; - - OptionalCovariateInfo(final long covariateID, final Covariate covariate) { - this.covariateID = covariateID; - this.covariate = covariate; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 
fae2ac898..7da3c372e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -43,6 +45,19 @@ public class ContextCovariate implements StandardCovariate { private int mismatchesContextSize; private int indelsContextSize; + private int mismatchesKeyMask; + private int indelsKeyMask; + + private static final int LENGTH_BITS = 4; + private static final int LENGTH_MASK = 15; + + // temporary lists to use for creating context covariate keys + private final ArrayList mismatchKeys = new ArrayList(200); + private final ArrayList indelKeys = new ArrayList(200); + + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are + // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. + static final private int MAX_DNA_CONTEXT = 13; private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers @@ -59,11 +74,15 @@ public class ContextCovariate implements StandardCovariate { if (mismatchesContextSize <= 0 || indelsContextSize <= 0) throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. 
Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); + + mismatchesKeyMask = createMask(mismatchesContextSize); + indelsKeyMask = createMask(indelsContextSize); } @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + // TODO -- wrong: fix me final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); @@ -71,10 +90,15 @@ public class ContextCovariate implements StandardCovariate { if (negativeStrand) bases = BaseUtils.simpleReverseComplement(bases); - final int readLength = clippedRead.getReadLength(); + mismatchKeys.clear(); + indelKeys.clear(); + contextWith(bases, mismatchesContextSize, mismatchKeys, mismatchesKeyMask); + contextWith(bases, indelsContextSize, indelKeys, indelsKeyMask); + + final int readLength = bases.length; for (int i = 0; i < readLength; i++) { - final long indelKey = contextWith(bases, i, indelsContextSize); - values.addCovariate(contextWith(bases, i, mismatchesContextSize), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i)); + final int indelKey = indelKeys.get(i); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? 
readLength - i - 1 : i)); } } @@ -85,7 +109,7 @@ public class ContextCovariate implements StandardCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file return null; @@ -93,147 +117,126 @@ public class ContextCovariate implements StandardCovariate { } @Override - public long longFromKey(Object key) { - return keyFromContext((String) key); + public int keyFromValue(final Object value) { + return keyFromContext((String) value); } - @Override - public int numberOfBits() { - return Integer.bitCount(Integer.MAX_VALUE); + private static int createMask(final int contextSize) { + int mask = 0; + // create 2*contextSize worth of bits + for (int i = 0; i < contextSize; i++) + mask = (mask << 2) | 3; + // shift 4 bits to mask out the bits used to encode the length + return mask << LENGTH_BITS; } /** * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) * * @param bases the bases in the read to build the context from - * @param offset the position in the read to calculate the context for * @param contextSize context size to use building the context - * @return the key representing the context + * @param keys list to store the keys + * @param mask mask for pulling out just the context bits */ - private long contextWith(final byte[] bases, final int offset, final int contextSize) { - final int start = offset - contextSize + 1; - final long result; - if (start >= 0) - result = keyFromContext(bases, start, offset + 1); - else - result = -1L; - return result; + private static void contextWith(final byte[] bases, final int contextSize, final ArrayList keys, final int mask) { + + // the first contextSize-1 bases will not have enough previous context + for (int i = 1; i < contextSize && i <= bases.length; i++) + keys.add(-1); + + if (bases.length < contextSize) + return; + 
+ final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; + + // get (and add) the key for the context starting at the first base + int currentKey = keyFromContext(bases, 0, contextSize); + keys.add(currentKey); + + // if the first key was -1 then there was an N in the context; figure out how many more consecutive contexts it affects + int currentNPenalty = 0; + if (currentKey == -1) { + currentKey = 0; + currentNPenalty = contextSize - 1; + int offset = newBaseOffset; + while (bases[currentNPenalty] != 'N') { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); + currentKey |= (baseIndex << offset); + offset -= 2; + currentNPenalty--; + } + } + + final int readLength = bases.length; + for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); + if (baseIndex == -1) { // ignore non-ACGT bases + currentNPenalty = contextSize; + currentKey = 0; // reset the key + } else { + // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in + currentKey = (currentKey >> 2) & mask; + currentKey |= (baseIndex << newBaseOffset); + currentKey |= contextSize; + } + + if (currentNPenalty == 0) { + keys.add(currentKey); + } else { + currentNPenalty--; + keys.add(-1); + } + } } - public static long keyFromContext(final String dna) { + public static int keyFromContext(final String dna) { return keyFromContext(dna.getBytes(), 0, dna.length()); } /** - * Creates a long representation of a given dna string. + * Creates a int representation of a given dna string. * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. - * - * The bit representation of a dna string is the simple: - * 0 A 4 AA 8 CA - * 1 C 5 AC ... 
- * 2 G 6 AG 1343 TTGGT - * 3 T 7 AT 1364 TTTTT - * - * To convert from dna to number, we convert the dna string to base10 and add all combinations that - * preceded the string (with smaller lengths). - * - * @param dna the dna sequence + * @param dna the dna sequence + * @param start the start position in the byte array (inclusive) + * @param end the end position in the array (exclusive) * @return the key representing the dna sequence */ - public static long keyFromContext(final byte[] dna, final int start, final int end) { - final long preContext = combinationsPerLength[end - start - 1]; // the sum of all combinations that preceded the length of the dna string - long baseTen = 0L; // the number in base_10 that we are going to use to generate the bit set + private static int keyFromContext(final byte[] dna, final int start, final int end) { + + int key = end - start; + int bitOffset = 4; for (int i = start; i < end; i++) { - baseTen = (baseTen << 2); // multiply by 4 final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); if (baseIndex == -1) // ignore non-ACGT bases - return -1L; - baseTen += (long)baseIndex; + return -1; + key |= (baseIndex << bitOffset); + bitOffset += 2; } - return baseTen + preContext; // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. - } - - static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion. - static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length - static { - for (int i = 0; i < MAX_DNA_CONTEXT + 1; i++) - computeCombinationsFor(i); - } - - /** - * The sum of all combinations of a context of a given length from length = 0 to length. 
- * - * Memoized implementation of sum(4^i) , where i=[0,length] - * - * @param length the length of the DNA context - */ - private static void computeCombinationsFor(final int length) { - long combinations = 0L; - for (int i = 1; i <= length; i++) - combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) ) - combinationsPerLength[length] = combinations; + return key; } /** * Converts a key into the dna string representation. * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. - * - * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the - * base_10 representation of the sequence. This is important for us to know how to bring the number - * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented - * as 0's and leading 0's are omitted). - * - * quasi-canonical because A is represented by a 0, therefore, - * instead of : 0, 1, 2, 3, 10, 11, 12, ... - * we have : 0, 1, 2, 3, 00, 01, 02, ... - * - * but we can correctly decode it because we know the final length. - * * @param key the key representing the dna sequence * @return the dna sequence represented by the key */ - public static String contextFromKey(long key) { + public static String contextFromKey(final int key) { if (key < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. 
Possible overflow?"); - final int length = contextLengthFor(key); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) - key -= combinationsPerLength[length - 1]; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases + int offset = 4; StringBuilder dna = new StringBuilder(); - while (key > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) - final byte base = (byte) (key & 3); // equivalent to (key % 4) - dna.append((char)BaseUtils.baseIndexToSimpleBase(base)); - key = key >> 2; // divide by 4 + for (int i = 0; i < length; i++) { + final int baseIndex = (key & mask) >> offset; + dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); + mask = mask << 2; // move the mask over to the next 2 bits + offset += 2; } - for (int j = dna.length(); j < length; j++) - dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above) - return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along - } - - /** - * Calculates the length of the DNA context for a given base 10 number - * - * It is important to know the length given the base 10 number to calculate the number of combinations - * and to disambiguate the "quasi-canonical" state. - * - * This method also calculates the number of combinations as a by-product, but since it memoizes the - * results, a subsequent call to combinationsFor(length) is O(1). 
- * - * @param number the base 10 representation of the key - * @return the length of the DNA context represented by this number - */ - private static int contextLengthFor(final long number) { - int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. - long combinations = combinationsPerLength[length]; // the next context (we advance it so we know which one was preceding it). - while (combinations <= number) { // find the length of the dna string (length) - length++; - combinations = combinationsPerLength[length]; // calculate the next context - } - return length; + return dna.toString(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java index ff86220b8..4b959eea4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -67,7 +67,7 @@ public interface Covariate { * @param key the long representation of the key * @return a string representation of the key */ - public String formatKey(final long key); + public String formatKey(final int key); /** * Converts an Object key into a long key using only the lowest numberOfBits() bits @@ -75,18 +75,10 @@ public interface Covariate { * Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in long format. For counting covariates * the getValues method already returns all values in long format. * - * @param key the object corresponding to the covariate + * @param value the object corresponding to the covariate * @return a long representation of the object */ - public long longFromKey(final Object key); - - /** - * Each covariate should determine how many bits are necessary to encode it's data - * - * @return The number of bits used to represent the values of this covariate. 
- */ - public int numberOfBits(); - + public int keyFromValue(final Object value); } interface RequiredCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java index 3e91ca539..3c917388c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java @@ -79,7 +79,7 @@ public class CycleCovariate implements StandardCovariate { final int CUSHION = 4; final int MAX_CYCLE = readLength - CUSHION - 1; for (int i = 0; i < readLength; i++) { - final long key = (iMAX_CYCLE) ? -1L : keyFromCycle(cycle); + final int key = (iMAX_CYCLE) ? -1 : keyFromCycle(cycle); values.addCovariate(key, key, key, i); cycle += increment; } @@ -106,22 +106,22 @@ public class CycleCovariate implements StandardCovariate { int iii = 0; while (iii < readLength) { while (iii < readLength && bases[iii] == (byte) 'T') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'A') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'C') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'G') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } @@ -132,7 +132,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, 
key, iii); iii++; } @@ -143,22 +143,22 @@ public class CycleCovariate implements StandardCovariate { int iii = readLength - 1; while (iii >= 0) { while (iii >= 0 && bases[iii] == (byte) 'T') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'A') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'C') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'G') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } @@ -169,7 +169,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } @@ -190,26 +190,21 @@ public class CycleCovariate implements StandardCovariate { } @Override - public String formatKey(final long key) { - long cycle = key >> 1; // shift so we can remove the "sign" bit + public String formatKey(final int key) { + int cycle = key >> 1; // shift so we can remove the "sign" bit if ( (key & 1) != 0 ) // is the last bit set? cycle *= -1; // then the cycle is negative return String.format("%d", cycle); } @Override - public long longFromKey(final Object key) { - return (key instanceof String) ? keyFromCycle(Integer.parseInt((String) key)) : keyFromCycle((Integer) key); + public int keyFromValue(final Object value) { + return (value instanceof String) ? 
keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); } - @Override - public int numberOfBits() { - return Integer.bitCount(Integer.MAX_VALUE); - } - - private static long keyFromCycle(final int cycle) { + private static int keyFromCycle(final int cycle) { // no negative values because values must fit into the first few bits of the long - long result = Math.abs(cycle); + int result = Math.abs(cycle); result = result << 1; // shift so we can add the "sign" bit if ( cycle < 0 ) result++; // negative cycles get the lower-most bit set diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java index c60ca38e1..8ee980124 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* @@ -49,7 +48,7 @@ public class QualityScoreCovariate implements RequiredCovariate { final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); for (int i = 0; i < baseQualities.length; i++) { - values.addCovariate((long)baseQualities[i], (long)baseInsertionQualities[i], (long)baseDeletionQualities[i], i); + values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); } } @@ -60,17 +59,12 @@ public class QualityScoreCovariate implements RequiredCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { return String.format("%d", key); } @Override - public long longFromKey(final Object key) { - return (key instanceof String) ? 
(long)Byte.parseByte((String) key) : (long)(Byte) key; + public int keyFromValue(final Object value) { + return (value instanceof String) ? (int)Byte.parseByte((String) value) : (int)(Byte) value; } - - @Override - public int numberOfBits() { - return BQSRKeyManager.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE); - } -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java index 02339330b..541f3a0a5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java @@ -1,13 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.recalibration.QualQuantizer; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.util.Arrays; import java.util.List; -import java.util.Map; /** * Class that encapsulates the information necessary for quality score quantization for BQSR @@ -30,25 +31,17 @@ public class QuantizationInfo { this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); } - public QuantizationInfo(Map> keysAndTablesMap, int quantizationLevels) { + public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution for (int i = 0; i < qualHistogram.length; i++) qualHistogram[i] = 0L; - Map qualTable = null; // look for the quality 
score table - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = entry.getKey(); - if (keyManager.getNumRequiredCovariates() == 2) // it should be the only one with 2 required covariates - qualTable = entry.getValue(); - } + final NestedHashMap qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); // get the quality score table - if (qualTable == null) - throw new ReviewedStingException("Could not find QualityScore table."); - - for (RecalDatum datum : qualTable.values()) { - int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - long nObservations = datum.numObservations; - qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key + for (final Object value : qualTable.getAllValues()) { + final RecalDatum datum = (RecalDatum)value; + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += datum.numObservations; // add the number of observations for every key } empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java index c9043dc04..5e907237d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - /** * The object temporarily held by a read that describes all of it's covariates. 
* @@ -11,65 +9,56 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; * @since 2/8/12 */ public class ReadCovariates { - private final long[][] mismatchesKeySet; - private final long[][] insertionsKeySet; - private final long[][] deletionsKeySet; + private final int[][][] keys; private int currentCovariateIndex = 0; - public ReadCovariates(int readLength, int numberOfCovariates) { - this.mismatchesKeySet = new long[readLength][numberOfCovariates]; - this.insertionsKeySet = new long[readLength][numberOfCovariates]; - this.deletionsKeySet = new long[readLength][numberOfCovariates]; + public ReadCovariates(final int readLength, final int numberOfCovariates) { + keys = new int[EventType.values().length][readLength][numberOfCovariates]; } public void setCovariateIndex(final int index) { currentCovariateIndex = index; } - public void addCovariate(final long mismatch, final long insertion, final long deletion, final int readOffset) { - mismatchesKeySet[readOffset][currentCovariateIndex] = mismatch; - insertionsKeySet[readOffset][currentCovariateIndex] = insertion; - deletionsKeySet[readOffset][currentCovariateIndex] = deletion; + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; } - public long[] getKeySet(final int readPosition, final EventType errorModel) { - switch (errorModel) { - case BASE_SUBSTITUTION: - return getMismatchesKeySet(readPosition); - case BASE_INSERTION: - return getInsertionsKeySet(readPosition); - case BASE_DELETION: - return getDeletionsKeySet(readPosition); - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); - } + public int[] getKeySet(final int readPosition, 
final EventType errorModel) { + return keys[errorModel.index][readPosition]; } - public long[] getMismatchesKeySet(final int readPosition) { - return mismatchesKeySet[readPosition]; + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.index]; } - public long[] getInsertionsKeySet(final int readPosition) { - return insertionsKeySet[readPosition]; + public int[] getMismatchesKeySet(final int readPosition) { + return keys[EventType.BASE_SUBSTITUTION.index][readPosition]; } - public long[] getDeletionsKeySet(final int readPosition) { - return deletionsKeySet[readPosition]; + public int[] getInsertionsKeySet(final int readPosition) { + return keys[EventType.BASE_INSERTION.index][readPosition]; + } + + public int[] getDeletionsKeySet(final int readPosition) { + return keys[EventType.BASE_DELETION.index][readPosition]; } /** * Testing routines */ - protected long[][] getMismatchesKeySet() { - return mismatchesKeySet; + protected int[][] getMismatchesKeySet() { + return keys[EventType.BASE_SUBSTITUTION.index]; } - protected long[][] getInsertionsKeySet() { - return insertionsKeySet; + protected int[][] getInsertionsKeySet() { + return keys[EventType.BASE_INSERTION.index]; } - protected long[][] getDeletionsKeySet() { - return deletionsKeySet; + protected int[][] getDeletionsKeySet() { + return keys[EventType.BASE_DELETION.index]; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index ae0ef38cc..c086ef6d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -40,9 +40,9 @@ import java.util.HashMap; public class ReadGroupCovariate implements RequiredCovariate { - private final HashMap readGroupLookupTable = new HashMap(); - private final HashMap readGroupReverseLookupTable = new 
HashMap(); - private long nextId = 0L; + private final HashMap readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private int nextId = 0; // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -51,7 +51,7 @@ public class ReadGroupCovariate implements RequiredCovariate { @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { final String readGroupId = readGroupValueFromRG(read.getReadGroup()); - final long key = keyForReadGroup(readGroupId); + final int key = keyForReadGroup(readGroupId); final int l = read.getReadLength(); for (int i = 0; i < l; i++) @@ -64,21 +64,16 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { return readGroupReverseLookupTable.get(key); } @Override - public long longFromKey(Object key) { - return keyForReadGroup((String) key); + public int keyFromValue(final Object value) { + return keyForReadGroup((String) value); } - @Override - public int numberOfBits() { - return BQSRKeyManager.numberOfBitsToRepresent(Short.MAX_VALUE); - } - - private long keyForReadGroup(final String readGroupId) { + private int keyForReadGroup(final String readGroupId) { if (!readGroupLookupTable.containsKey(readGroupId)) { readGroupLookupTable.put(readGroupId, nextId); readGroupReverseLookupTable.put(nextId, readGroupId); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index 1356ffa94..ec82da95f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -32,11 +32,13 @@ import org.broadinstitute.sting.utils.BaseUtils; import 
org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -82,6 +84,14 @@ public class RecalDataManager { private static final String SCRIPT_FILE = "BQSR.R"; + private static final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); + private static final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + public enum SOLID_RECAL_MODE { /** @@ -141,30 +151,6 @@ public class RecalDataManager { } } - - /** - * Initializes the recalibration table -> key manager map - * - * @param requiredCovariates list of required covariates (in order) - * @param optionalCovariates list of optional covariates (in order) - * @return a map with each key manager and 
it's corresponding recalibration table properly initialized - */ - public static LinkedHashMap> initializeTables(ArrayList requiredCovariates, ArrayList optionalCovariates) { - final LinkedHashMap> tablesAndKeysMap = new LinkedHashMap>(); - final ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. - final ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables - for (Covariate covariate : requiredCovariates) { - requiredCovariatesToAdd.add(covariate); - final Map recalTable = new HashMap(); // initializing a new recal table for each required covariate (cumulatively) - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager - tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map - } - final Map recalTable = new HashMap(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager - tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map - return tablesAndKeysMap; - } - /** * Generates two lists : required covariates and optional covariates based on the user's requests. 
* @@ -223,42 +209,29 @@ public class RecalDataManager { logger.info(""); } - private static List generateReportTables(Map> keysAndTablesMap) { + private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { List result = new LinkedList(); int tableIndex = 0; - final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); - final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); - final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); - final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - final BQSRKeyManager keyManager = entry.getKey(); - final Map recalTable = entry.getValue(); + for (final RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { - final boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs. 
- - final Covariate[] requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table - final Covariate[] optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table - - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - - for (final Covariate covariate : requiredList) { - final String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order - columnNames.add(new Pair(name, "%s")); // save the required covariate name so we can reference it in the future - } - - if (optionalList.length > 0) { - columnNames.add(covariateValue); - columnNames.add(covariateName); + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + if (type != RecalibrationTables.TableType.READ_GROUP_TABLE) { + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + if (type == RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } } columnNames.add(eventType); // the order of these column names is important here columnNames.add(empiricalQuality); - if (isReadGroupTable) + if (type == RecalibrationTables.TableType.READ_GROUP_TABLE) columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported columnNames.add(nObservations); columnNames.add(nErrors); @@ -269,42 +242,59 @@ public class RecalDataManager { int rowIndex = 0; - for (Map.Entry recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys - final Long 
bitSetKey = recalTableEntry.getKey(); - final Map columnData = new HashMap(columnNames.size()); - final Iterator> iterator = columnNames.iterator(); - for (final Object key : keyManager.keySetFrom(bitSetKey)) { - final String columnName = iterator.next().getFirst(); - columnData.put(columnName, key); - } - final RecalDatum datum = recalTableEntry.getValue(); - columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); - if (isReadGroupTable) - columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - columnData.put(iterator.next().getFirst(), datum.numObservations); - columnData.put(iterator.next().getFirst(), datum.numMismatches); + final NestedHashMap table = recalibrationTables.getTable(type); + for (final NestedHashMap.Leaf row : table.getAllLeaves()) { + final RecalDatum datum = (RecalDatum)row.value; + final List keys = row.keys; - for (final Map.Entry dataEntry : columnData.entrySet()) { - final String columnName = dataEntry.getKey(); - final Object value = dataEntry.getValue(); - reportTable.set(rowIndex, columnName, value.toString()); + int columnIndex = 0; + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), requestedCovariates[0].formatKey((Integer)keys.get(columnIndex++))); + if (type != RecalibrationTables.TableType.READ_GROUP_TABLE) { + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), requestedCovariates[1].formatKey((Integer) keys.get(columnIndex++))); + if (type == RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE) { + final int covariateIndex = (Integer)keys.get(columnIndex); + final Covariate covariate = requestedCovariates[2 + covariateIndex]; + final int covariateKey = (Integer)keys.get(columnIndex+1); + + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(covariateKey)); + setReportTableCell(reportTable, rowIndex, 
columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); + } } + + final EventType event = EventType.eventFrom((Integer)keys.get(columnIndex)); + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), event); + + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); + if (type == RecalibrationTables.TableType.READ_GROUP_TABLE) + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.numObservations); + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), datum.numMismatches); + rowIndex++; } result.add(reportTable); } + return result; } - public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map> keysAndTablesMap, PrintStream outputFile) { - outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + private static String parseCovariateName(final Covariate covariate) { + return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, PrintStream outputFile) { - outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + private static void setReportTableCell(final GATKReportTable reportTable, final int rowIndex, final String columnName, final Object value) { + reportTable.set(rowIndex, columnName, value.toString()); } - private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List recalTables, 
PrintStream outputFile) { + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { + outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + } + + public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { + outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + } + + private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables, final PrintStream outputFile) { final GATKReport report = new GATKReport(); report.addTable(argumentTable); report.addTable(quantizationTable); @@ -340,108 +330,87 @@ public class RecalDataManager { } - public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, boolean keepIntermediates) { + public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) { final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), original, "ORIGINAL", true); + writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true); outputRecalibrationPlot(files, keepIntermediates); } - public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, LinkedHashMap> recalibrated, boolean keepIntermediates) { + public static void generateRecalibrationPlot(final File filename, final RecalibrationTables 
original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) { final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true); - writeCSV(files.getFirst(), original, "ORIGINAL", false); + writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false); outputRecalibrationPlot(files, keepIntermediates); } - private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap> map, String recalibrationMode, boolean printHeader) { - final int QUALITY_SCORE_COVARIATE_INDEX = 1; - final Map deltaTable = new HashMap(); - BQSRKeyManager deltaKeyManager = null; + private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { + final NestedHashMap deltaTable = new NestedHashMap(); - - for (Map.Entry> tableEntry : map.entrySet()) { - final BQSRKeyManager keyManager = tableEntry.getKey(); - - if (keyManager.getNumOptionalCovariates() > 0) { // initialize with the 'all covariates' table - // create a key manager for the delta table - final List requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates()[0]); // include the read group covariate as the only required covariate - final List optionalCovariates = new ArrayList(); - optionalCovariates.add(keyManager.getRequiredCovariates()[1]); // include the quality score covariate as an optional covariate - optionalCovariates.addAll(Arrays.asList(keyManager.getOptionalCovariates())); // include all optional covariates - deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager - } + // add the quality score table to the delta table + final NestedHashMap qualTable = 
recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + for (final NestedHashMap.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final List newCovs = new ArrayList(4); + newCovs.add(leaf.keys.get(0)); + newCovs.add(requestedCovariates.length); // replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs.add(leaf.keys.get(1)); + newCovs.add(leaf.keys.get(2)); + addToDeltaTable(deltaTable, newCovs.toArray(), (RecalDatum)leaf.value); // add this covariate to the delta table } - if (deltaKeyManager == null) - throw new ReviewedStingException ("Couldn't find the covariates table"); - - boolean readyToPrint = false; - for (Map.Entry> tableEntry : map.entrySet()) { - final BQSRKeyManager keyManager = tableEntry.getKey(); - - if (keyManager.getNumRequiredCovariates() == 2 && keyManager.getNumOptionalCovariates() == 0) { // look for the QualityScore table - final Map table = tableEntry.getValue(); - - // add the quality score table to the delta table - for (final Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table - final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) - - final List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key - final List newCovs = new ArrayList(4); - newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score - newCovs.add(1, covs.get(1)); - newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) - newCovs.add(3, covs.get(2)); - final long deltaKey = deltaKeyManager.longFromKey(newCovs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table - } - } - - else if (keyManager.getNumOptionalCovariates() > 0) { // look 
for the optional covariates table - final Map table = tableEntry.getValue(); - - // add the optional covariates to the delta table - for (final Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table - final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) - - final List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key - covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) - final long deltaKey = deltaKeyManager.longFromKey(covs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table - } - readyToPrint = true; - } - - // output the csv file - if (readyToPrint) { - - if (printHeader) { - final List header = new LinkedList(); - header.add("ReadGroup"); - header.add("CovariateValue"); - header.add("CovariateName"); - header.add("EventType"); - header.add("Observations"); - header.add("Errors"); - header.add("EmpiricalQuality"); - header.add("AverageReportedQuality"); - header.add("Accuracy"); - header.add("Recalibration"); - deltaTableFile.println(Utils.join(",", header)); - } - - // print each data line - for (final Map.Entry deltaEntry : deltaTable.entrySet()) { - final List deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); - final RecalDatum deltaDatum = deltaEntry.getValue(); - deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.stringForCSV()); - deltaTableFile.println("," + recalibrationMode); - } - - } - + // add the optional covariates to the delta table + final NestedHashMap covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE); + for (final NestedHashMap.Leaf leaf : covTable.getAllLeaves()) { + final List covs = new ArrayList(leaf.keys); + 
covs.remove(1); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + addToDeltaTable(deltaTable, covs.toArray(), (RecalDatum)leaf.value); // add this covariate to the delta table } + + // output the csv file + if (printHeader) { + final List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + deltaTableFile.println(Utils.join(",", header)); + } + + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + // print each data line + for (final NestedHashMap.Leaf leaf : deltaTable.getAllLeaves()) { + final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); + final RecalDatum deltaDatum = (RecalDatum)leaf.value; + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + } + + private static List generateValuesFromKeys(final List keys, final Covariate[] covariates, final Map covariateNameMap) { + final List values = new ArrayList(4); + values.add(covariates[0].formatKey((Integer)keys.get(0))); + + // TODO -- create static final variables to hold the indexes of the RG, qual, cov ID, etc. + + final int covariateIndex = (Integer)keys.get(1); + final Covariate covariate = covariateIndex == covariates.length ? 
covariates[1] : covariates[2 + covariateIndex]; + final int covariateKey = (Integer)keys.get(2); + values.add(covariate.formatKey(covariateKey)); + values.add(covariateNameMap.get(covariate)); + + final EventType event = EventType.eventFrom((Integer)keys.get(3)); + values.add(event); + + return values; } /** @@ -453,15 +422,14 @@ public class RecalDataManager { * @param deltaKey the key to the table * @param recalDatum the recal datum to combine with the accuracyDatum element in the table */ - private static void addToDeltaTable(Map deltaTable, Long deltaKey, RecalDatum recalDatum) { - final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + private static void addToDeltaTable(final NestedHashMap deltaTable, final Object[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key if (deltaDatum == null) - deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); // if we don't have a key yet, create a new one with the same values as the current datum else deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. } - /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string * * @param read The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. 
- * @param readCovariates The object to store the covariate values + * @param resultsStorage The object to store the covariate values */ - public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates readCovariates) { + public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read for (int i = 0; i < requestedCovariates.length; i++) { - readCovariates.setCovariateIndex(i); - requestedCovariates[i].recordValues(read, readCovariates); + resultsStorage.setCovariateIndex(i); + requestedCovariates[i].recordValues(read, resultsStorage); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 3eb3a3981..b26912c31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -113,8 +113,7 @@ public class RecalDatum extends Datum { return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); } - - private double calcExpectedErrors() { + private double calcExpectedErrors() { return (double) this.numObservations * qualToErrorProb(estimatedQReported); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index 5af15c01c..a7088f4b6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -3,8 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import 
org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.io.File; import java.io.PrintStream; @@ -18,14 +19,19 @@ import java.util.*; */ public class RecalibrationReport { private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager + private final RecalibrationTables recalibrationTables; // quick access reference to the tables private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private final HashMap optionalCovariateIndexes; private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + private final Object[] tempRGarray = new Object[2]; + private final Object[] tempQUALarray = new Object[3]; + private final Object[] tempCOVarray = new Object[5]; + public RecalibrationReport(final File RECAL_FILE) { - GATKReport report = new GATKReport(RECAL_FILE); + final GATKReport report = new GATKReport(RECAL_FILE); argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE); RAC = initializeArgumentCollectionTable(argumentTable); @@ -37,52 +43,39 @@ public class RecalibrationReport { ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); requestedCovariates = new Covariate[requiredCovariates.size() + 
optionalCovariates.size()]; + optionalCovariateIndexes = new HashMap(optionalCovariates.size()); int covariateIndex = 0; for (final Covariate covariate : requiredCovariates) requestedCovariates[covariateIndex++] = covariate; - for (final Covariate covariate : optionalCovariates) - requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) { + requestedCovariates[covariateIndex] = covariate; + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + optionalCovariateIndexes.put(covariateName, covariateIndex-2); + covariateIndex++; + } for (Covariate cov : requestedCovariates) cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - keysAndTablesMap = new LinkedHashMap>(); - ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. - ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables - for (Covariate covariate : requiredCovariates) { - requiredCovariatesToAdd.add(covariate); - final Map table; // initializing a new recal table for each required covariate (cumulatively) - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager + final GATKReportTable rgReportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); + final NestedHashMap rgTable = parseReadGroupTable(rgReportTable); - final int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) - final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. 
Did you add an extra required covariate? This is a hard check."; - if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table - final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); - table = parseReadGroupTable(keyManager, reportTable); - } - else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table - final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE); - table = parseQualityScoreTable(keyManager, reportTable); - } - else - throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION); + final GATKReportTable qualReportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE); + final NestedHashMap qualTable = parseQualityScoreTable(qualReportTable); - keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map - } + final GATKReportTable covReportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); + final NestedHashMap covTable = parseAllCovariatesTable(covReportTable); - - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager - final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); - final Map table = parseAllCovariatesTable(keyManager, reportTable); - keysAndTablesMap.put(keyManager, table); + recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); } - protected RecalibrationReport(final QuantizationInfo quantizationInfo, final LinkedHashMap> keysAndTablesMap, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { + protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final GATKReportTable 
argumentTable, final RecalibrationArgumentCollection RAC) { this.quantizationInfo = quantizationInfo; - this.keysAndTablesMap = keysAndTablesMap; + this.recalibrationTables = recalibrationTables; this.argumentTable = argumentTable; this.RAC = RAC; this.requestedCovariates = null; + this.optionalCovariateIndexes = null; } /** @@ -98,29 +91,20 @@ public class RecalibrationReport { * * @param other the recalibration report to combine with this one */ - public void combine(RecalibrationReport other) { - Iterator>> thisIterator = keysAndTablesMap.entrySet().iterator(); + public void combine(final RecalibrationReport other) { - for (Map.Entry> otherEntry : other.getKeysAndTablesMap().entrySet()) { - Map.Entry> thisEntry = thisIterator.next(); + for (RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { + final NestedHashMap myTable = recalibrationTables.getTable(type); + final NestedHashMap otherTable = other.recalibrationTables.getTable(type); - final Map thisTable = thisEntry.getValue(); - final BQSRKeyManager thisKeyManager = thisEntry.getKey(); - final BQSRKeyManager otherKeyManager = otherEntry.getKey(); + for (final NestedHashMap.Leaf row : otherTable.getAllLeaves()) { + final RecalDatum myDatum = (RecalDatum)myTable.get(row.keys); - for (Map.Entry otherTableEntry : otherEntry.getValue().entrySet()) { - final RecalDatum otherDatum = otherTableEntry.getValue(); - final Long otherBitKey = otherTableEntry.getKey(); - final List otherObjectKey = otherKeyManager.keySetFrom(otherBitKey); - - final long thisKey = thisKeyManager.longFromKey(otherObjectKey.toArray()); - final RecalDatum thisDatum = thisTable.get(thisKey); - - if (thisDatum == null) - thisTable.put(thisKey, otherDatum); + if (myDatum == null) + myTable.put(row.value, row.keys); else - thisDatum.combine(otherDatum); - } + myDatum.combine((RecalDatum)row.value); + } } } @@ -128,8 +112,8 @@ public class RecalibrationReport { return quantizationInfo; } - public LinkedHashMap> 
getKeysAndTablesMap() { - return keysAndTablesMap; + public RecalibrationTables getRecalibrationTables() { + return recalibrationTables; } public Covariate[] getRequestedCovariates() { @@ -139,82 +123,87 @@ public class RecalibrationReport { /** * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table * - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(5); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + private NestedHashMap parseAllCovariatesTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); + + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); + final String covName = (String)reportTable.get(i, RecalDataManager.COVARIATE_NAME_COLUMN_NAME); + final int covIndex = optionalCovariateIndexes.get(covName); + tempCOVarray[2] = covIndex; + final Object covValue = reportTable.get(i, RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); + 
tempCOVarray[3] = requestedCovariates[covIndex + 2].keyFromValue(covValue); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempCOVarray[4] = event.index; + + result.put(getRecalDatum(reportTable, i, false), tempCOVarray); + } + + return result; } /** * * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(3); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + private NestedHashMap parseQualityScoreTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); + + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempQUALarray[2] = event.index; + + result.put(getRecalDatum(reportTable, i, false), tempQUALarray); + } + + return result; } /** * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual 
table * - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(2); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, true); - } - - /** - * Shared parsing functionality for all tables. - * - * @param keyManager the key manager for this table - * @param reportTable the GATKReport table containing data for this table - * @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table - * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. - */ - private Map genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList columnNamesOrderedList, boolean hasEstimatedQReportedColumn) { - final Map result = new HashMap(reportTable.getNumRows()*2); + private NestedHashMap parseReadGroupTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final int nKeys = columnNamesOrderedList.size(); - final Object [] keySet = new Object[nKeys]; - for (int j = 0; j < nKeys; j++) - keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below) - keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. 
We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager). - final long bitKey = keyManager.longFromKey(keySet); + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempRGarray[1] = event.index; - final long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); - final long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); - final double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); - - final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table - (Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table - - final RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); - result.put(bitKey, recalDatum); + result.put(getRecalDatum(reportTable, i, true), tempRGarray); } + return result; } + private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { + final long nObservations = (Long) reportTable.get(row, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); + final long nErrors = (Long) reportTable.get(row, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); + final double empiricalQuality = (Double) reportTable.get(row, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); + + final double estimatedQReported = hasEstimatedQReportedColumn ? 
// the estimatedQreported column only exists in the ReadGroup table + (Double) reportTable.get(row, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + + return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); + } + /** * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores * @@ -308,55 +297,21 @@ public class RecalibrationReport { * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. */ public void calculateEmpiricalAndQuantizedQualities() { - for (Map table : keysAndTablesMap.values()) - for (RecalDatum datum : table.values()) - datum.calcCombinedEmpiricalQuality(); + for (RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { + final NestedHashMap table = recalibrationTables.getTable(type); + for (final Object value : table.getAllValues()) { + ((RecalDatum)value).calcCombinedEmpiricalQuality(); + } + } - quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS); + quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); } public void output(PrintStream output) { - RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output); + RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output); } public RecalibrationArgumentCollection getRAC() { return RAC; } - - @Override - public boolean equals(Object o) { - if (!(o instanceof RecalibrationReport)) - return false; - RecalibrationReport other = (RecalibrationReport) o; - if (this == o) - return true; - return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap); - } - - private boolean 
isEqualTable(LinkedHashMap> t1, LinkedHashMap> t2) { - if (t1.size() != t2.size()) - return false; - - final Iterator>> t1Iterator = t1.entrySet().iterator(); - final Iterator>> t2Iterator = t2.entrySet().iterator(); - - while (t1Iterator.hasNext() && t2Iterator.hasNext()) { - Map.Entry> t1MapEntry = t1Iterator.next(); - Map.Entry> t2MapEntry = t2Iterator.next(); - - if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey()))) - return false; - - final Map table2 = t2MapEntry.getValue(); - for (Map.Entry t1TableEntry : t1MapEntry.getValue().entrySet()) { - final Long t1Key = t1TableEntry.getKey(); - if (!table2.containsKey(t1Key)) - return false; - final RecalDatum t1Datum = t1TableEntry.getValue(); - if (!t1Datum.equals(table2.get(t1Key))) - return false; - } - } - return true; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index cba38d0de..369731530 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -266,13 +266,13 @@ public class DiagnoseTargets extends LocusWalker { alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); - VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF - vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds)))); + vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - 
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); + attributes.put(ThresHolder.AVG_INTERVAL_DP_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); if (debug) { @@ -282,7 +282,7 @@ public class DiagnoseTargets extends LocusWalker { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStatistics sampleStat = stats.getSample(sample); - gb.DP((int)sampleStat.averageCoverage()); + gb.attribute(ThresHolder.AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); gb.attribute("Q1", sampleStat.getQuantileDepth(0.25)); gb.attribute("MED", sampleStat.getQuantileDepth(0.50)); gb.attribute("Q3", sampleStat.getQuantileDepth(0.75)); @@ -290,7 +290,7 @@ public class DiagnoseTargets extends LocusWalker { if (debug) { System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads()); } - gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds))); + gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds), false)); genotypes.add(gb.make()); } @@ -307,11 +307,12 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusesToStrings(Set statuses) { + private List statusesToStrings(Set statuses, final boolean includePASS) { List output = new ArrayList(statuses.size()); for (CallableStatus status : statuses) - output.add(status.name()); + if ( includePASS || status != CallableStatus.PASS ) // adding pass => results in a filter for genotypes + output.add(status.name()); return output; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index 234906944..0d8195551 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -31,6 +31,7 @@ import java.util.HashSet; import java.util.Set; class ThresHolder { + public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP"; public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); private final int minimumBaseQuality; @@ -129,12 +130,13 @@ class ThresHolder { // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - headerLines.add(new VCFInfoHeaderLine("AVG_INTERVAL_DP", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype // todo -- find the appropriate VCF constants - headerLines.add(new VCFFormatHeaderLine("AVG_INTERVAL_DP", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); + headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index df5f5adf1..b4cf96831 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -64,9 +64,10 @@ public class VCFDiffableReader implements DiffableReader { root.add("VERSION", version); br.close(); - // must be read as state is stored in reader itself - AbstractVCFCodec.disableOnTheFlyModifications(); - FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + final VCFCodec vcfCodec = new VCFCodec(); + vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself + + FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false); VCFHeader header = (VCFHeader)reader.getHeader(); for ( VCFHeaderLine headerLine : header.getMetaData() ) { String key = headerLine.getKey(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8f3b0ea07..71352bddd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -176,7 +176,7 @@ public class 
VariantFiltrationWalker extends RodWalker { hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); if ( genotypeFilterExps.size() > 0 ) - hInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); + hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); if ( mask.isBound() ) { hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 5b9a83a1b..29ca1265c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // the annotation engine private VariantAnnotatorEngine annotationEngine; + private Set samples; + // enable deletions in the pileup @Override public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode"); // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // initialize the verbose writer if ( verboseWriter != null ) @@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * @return the VariantCallContext object */ public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - return 
UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); + return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples); } public UGStatistics reduceInit() { return new UGStatistics(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5cc74e729..ecaf9df6a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine { } /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object + * same as the full call but with allSamples == null + * + * @param tracker + * @param refContext + * @param rawContext + * @return */ - public List calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); + } + + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. 
+ * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Set allSamples) { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); @@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine { } } - return results; + return addMissingSamples(results, allSamples); + } + + private List addMissingSamples(final List calls, final Set allSamples) { + if ( calls.isEmpty() || allSamples == null ) return calls; + + final List withAllSamples = new ArrayList(calls.size()); + for ( final VariantCallContext call : calls ) { + if ( call == null ) + withAllSamples.add(call); + else { + final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); + withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); + } + } + + return withAllSamples; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index 22c0131c2..2e3fc26f6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -261,6 
+261,7 @@ public class GenotypeAndValidateWalker extends RodWalker samples; public static class CountedData { private long nAltCalledAlt = 0L; @@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); - Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 6a55b024b..629c7f84c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker { /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ private boolean sitesOnlyVCF = false; + private Set samples; public void initialize() { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); + if ( vcfWriter instanceof VariantContextWriterStub) { + sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + if ( PRIORITY_STRING == null ) { PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); } 
validateAnnotateUnionArguments(); - Set samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption); + samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) SET_KEY = null; @@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker { headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); if ( !ASSUME_IDENTICAL_SAMPLES ) headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions)); - VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples); + VCFHeader vcfHeader = new VCFHeader(headerLines, samples); vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); vcfWriter.writeHeader(vcfHeader); - - if ( vcfWriter instanceof VariantContextWriterStub) { - sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); - if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); - } else - logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); } private void validateAnnotateUnionArguments() { @@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker { VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); } return vcs.isEmpty() ? 
0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 33ab5a4c3..fbffd620a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -510,7 +510,7 @@ public class SelectVariants extends RodWalker implements TreeR for (VariantContext vc : vcs) { // an option for performance testing only if ( fullyDecode ) - vc = vc.fullyDecode(vcfRods.get(vc.getSource())); + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() ); // an option for performance testing only if ( forceGenotypesDecode ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index b508a9dd5..e8c6794f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker { private Set allowedGenotypeFormatStrings = new HashSet(); private boolean wroteHeader = false; + private Set samples; // for dealing with indels in hapmap CloseableIterator dbsnpIterator = null; @@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker { } } - Set samples = new LinkedHashSet(); + samples = new LinkedHashSet(); if ( sampleName != null ) { samples.add(sampleName); } else { @@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); + vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java 
b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 3871ca987..393dd5735 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import java.util.Arrays; + /** * BaseUtils contains some basic utilities for manipulating nucleotides. */ @@ -47,6 +49,20 @@ public class BaseUtils { public boolean sameBase(int i) { return index == i; } } + static private final int[] baseIndexMap = new int[256]; + static { + Arrays.fill(baseIndexMap, -1); + baseIndexMap['A'] = 0; + baseIndexMap['a'] = 0; + baseIndexMap['*'] = 0; // the wildcard character counts as an A + baseIndexMap['C'] = 1; + baseIndexMap['c'] = 1; + baseIndexMap['G'] = 2; + baseIndexMap['g'] = 2; + baseIndexMap['T'] = 3; + baseIndexMap['t'] = 3; + } + // todo -- fix me (enums?) public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') @@ -182,27 +198,7 @@ public class BaseUtils { * @return 0, 1, 2, 3, or -1 if the base can't be understood */ static public int simpleBaseToBaseIndex(byte base) { - switch (base) { - case '*': // the wildcard character counts as an A - case 'A': - case 'a': - return 0; - - case 'C': - case 'c': - return 1; - - case 'G': - case 'g': - return 2; - - case 'T': - case 't': - return 3; - - default: - return -1; - } + return baseIndexMap[base]; } /** @@ -213,27 +209,7 @@ public class BaseUtils { */ @Deprecated static public int simpleBaseToBaseIndex(char base) { - switch (base) { - case '*': // the wildcard character counts as an A - case 'A': - case 'a': - return 0; - - case 'C': - case 'c': - return 1; - - case 'G': - case 'g': - return 2; - - case 'T': - case 't': - return 3; - - default: - return -1; - } + return baseIndexMap[base]; } static public int extendedBaseToBaseIndex(byte base) { @@ -284,11 +260,6 @@ public class 
BaseUtils { } } - @Deprecated - static public char baseIndexToSimpleBaseAsChar(int baseIndex) { - return (char) baseIndexToSimpleBase(baseIndex); - } - /** * Converts a base index to a base index representing its cross-talk partner * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 91331ac13..94c24b097 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -101,15 +101,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD @Override public Feature decodeLoc( final PositionalBufferedStream inputStream ) { - recordNo++; - final VariantContextBuilder builder = new VariantContextBuilder(); - - final int sitesBlockSize = decoder.readBlockSize(inputStream); - final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream - decoder.readNextBlock(sitesBlockSize, inputStream); - decodeSiteLoc(builder); - - return builder.fullyDecoded(true).make(); + return decode(inputStream); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 1bb833868..7a6d96131 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -136,6 +136,10 @@ public final class BCF2Decoder { public final Object decodeTypedValue(final byte typeDescriptor) { final int size = decodeNumberOfElements(typeDescriptor); + return decodeTypedValue(typeDescriptor, size); + } + + public final Object decodeTypedValue(final byte typeDescriptor, final int size) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); assert size >= 0; @@ -285,8 +289,7 @@ public final class BCF2Decoder { } 
} - public final int[] decodeIntArray(final byte typeDescriptor) { - final int size = decodeNumberOfElements(typeDescriptor); + public final int[] decodeIntArray(final byte typeDescriptor, final int size) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); return decodeIntArray(size, type, null); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index 5a4d1d0da..59537a329 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -104,19 +104,17 @@ public class BCF2GenotypeFieldDecoders { final String field, final BCF2Decoder decoder, final byte typeDescriptor, + final int numElements, final GenotypeBuilder[] gbs); } private class GTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { - // we have to do a bit of low-level processing here as we want to know the size upfronta - final int ploidy = decoder.decodeNumberOfElements(typeDescriptor); - - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { + if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { - generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs); + generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); } } @@ -218,7 +216,7 @@ public class BCF2GenotypeFieldDecoders { 
private class DPDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); @@ -228,7 +226,7 @@ public class BCF2GenotypeFieldDecoders { private class GQDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); @@ -238,27 +236,27 @@ public class BCF2GenotypeFieldDecoders { private class ADDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - gb.AD(decoder.decodeIntArray(typeDescriptor)); + gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class PLDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final 
GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - gb.PL(decoder.decodeIntArray(typeDescriptor)); + gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class GenericDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor); + Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if ( value != null ) { // don't add missing values if ( value instanceof List && ((List)value).size() == 1) { // todo -- I really hate this, and it suggests that the code isn't completely right @@ -275,9 +273,9 @@ public class BCF2GenotypeFieldDecoders { private class FTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor); + Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if ( value != null ) { // don't add missing values gb.filters(value instanceof String ? 
Collections.singletonList((String)value) : (List)value); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 7f10375bb..c749325fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -77,9 +77,10 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { // the type of each element final byte typeDescriptor = decoder.readTypeDescriptor(); + final int numElements = decoder.decodeNumberOfElements(typeDescriptor); final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field); try { - fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders); + fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, numElements, builders); } catch ( ClassCastException e ) { throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field + " inconsistent with the value observed in the decoded value"); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 2669206fe..21deb4158 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -32,10 +32,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.*; import java.util.*; /** @@ -200,17 +197,35 @@ public final class BCF2Utils { * foo.vcf => foo.bcf * foo.xxx => 
foo.xxx.bcf * + * If the resulting BCF file cannot be written, return null. Happens + * when vcfFile = /dev/null for example + * * @param vcfFile - * @return + * @return the BCF */ @Requires("vcfFile != null") - @Ensures("result != null") public static final File shadowBCF(final File vcfFile) { final String path = vcfFile.getAbsolutePath(); if ( path.contains(".vcf") ) return new File(path.replace(".vcf", ".bcf")); - else - return new File( path + ".bcf" ); + else { + final File bcf = new File( path + ".bcf" ); + if ( bcf.canRead() ) + return bcf; + else { + try { + // this is the only way to robustly decide if we could actually write to BCF + final FileOutputStream o = new FileOutputStream(bcf); + o.close(); + bcf.delete(); + return bcf; + } catch ( FileNotFoundException e ) { + return null; + } catch ( IOException e ) { + return null; + } + } + } } @Ensures("BCF2Type.INTEGERS.contains(result)") diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 77aed0e0b..f9f310538 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -22,7 +22,6 @@ import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); - protected static boolean doOnTheFlyModifications = true; protected final static Logger log = Logger.getLogger(AbstractVCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -61,6 +60,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected boolean warnedAboutNoEqualsForNonFlag = false; + /** + * If true, then we'll magically fix up VCF headers on the fly when we read them in + */ + protected boolean 
doOnTheFlyModifications = true; + protected AbstractVCFCodec() { super(VariantContext.class); } @@ -850,7 +854,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * of VCF records. Useful primarily for raw comparisons such as when comparing * raw VCF records */ - public static final void disableOnTheFlyModifications() { + public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 97f3ecd0c..667de3dea 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -28,6 +28,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.LinkedHashMap; @@ -67,17 +69,30 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF return count; } - // utility method - public int getCount(int numAltAlleles) { - int myCount; + /** + * Get the number of values expected for this header field, given the properties of VariantContext vc + * + * If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1 + * If the count is of type A, return vc.getNAlleles - 1 + * If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the + * max ploidy among all samples. 
Note that if the max ploidy of the VC is 0 (there's no GT information + * at all, then implicitly assume diploid samples when computing G values. + * If the count is UNBOUNDED return -1 + * + * @param vc + * @return + */ + public int getCount(final VariantContext vc) { switch ( countType ) { - case INTEGER: myCount = count; break; - case UNBOUNDED: myCount = -1; break; - case A: myCount = numAltAlleles; break; - case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; - default: throw new ReviewedStingException("Unknown count type: " + countType); + case INTEGER: return count; + case UNBOUNDED: return -1; + case A: return vc.getNAlleles() - 1; + case G: + final int ploidy = vc.getMaxPloidy(); + return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 2 : ploidy); + default: + throw new ReviewedStingException("Unknown count type: " + countType); } - return myCount; } public void setNumberToUnbounded() { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java index 84c60d9d1..dcc141b00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java @@ -183,6 +183,7 @@ public class VCFStandardHeaderLines { registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order 
listed")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); // INFO lines registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java index 8652d3c28..6e79b7f24 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.utils.collections; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -83,4 +85,53 @@ public class NestedHashMap { return value; // todo -- should never reach this point } + + public List getAllValues() { + List result = new ArrayList(); + fillAllValues(data, result); + return result; + } + + private void fillAllValues(final Map map, final List result) { + for ( Object value : map.values() ) { + if ( value == null ) + continue; + if ( value instanceof Map ) + fillAllValues((Map)value, result); + else + result.add(value); + } + } + + public static class Leaf { + public final List keys; + public final Object value; + + public Leaf(final List keys, final Object value) { + this.keys = keys; + this.value = value; + } + } + + public List getAllLeaves() { + List result = new ArrayList(); + List path = new ArrayList(); + fillAllLeaves(data, path, result); + return result; + } + + private void fillAllLeaves(final Map map, final List path, final List result) { + for ( final Object key : map.keySet() ) { + final Object value = map.get(key); + if ( value == null ) + continue; + final List newPath = new ArrayList(path); + newPath.add(key); + if ( value instanceof Map ) { + fillAllLeaves((Map) value, 
newPath, result); + } else { + result.add(new Leaf(newPath, value)); + } + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c96226405..6ee4af288 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -6,6 +6,8 @@ import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -169,21 +171,23 @@ public class IntervalUtils { */ public static List mergeListsBySetOperator(List setOne, List setTwo, IntervalSetRule rule) { // shortcut, if either set is zero, return the other set - if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) return (setOne == null || setOne.size() == 0) ? setTwo : setOne; + if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) + return Collections.unmodifiableList((setOne == null || setOne.size() == 0) ? 
setTwo : setOne); + + // our master list, since we can't guarantee removal time in a generic list + LinkedList retList = new LinkedList(); // if we're set to UNION, just add them all - if (rule == IntervalSetRule.UNION) { - setOne.addAll(setTwo); - return setOne; + if (rule == null || rule == IntervalSetRule.UNION) { + retList.addAll(setOne); + retList.addAll(setTwo); + return Collections.unmodifiableList(retList); } // else we're INTERSECTION, create two indexes into the lists int iOne = 0; int iTwo = 0; - // our master list, since we can't guarantee removal time in a generic list - LinkedList retList = new LinkedList(); - // merge the second into the first using the rule while (iTwo < setTwo.size() && iOne < setOne.size()) // if the first list is ahead, drop items off the second until we overlap @@ -204,7 +208,7 @@ public class IntervalUtils { throw new UserException.BadInput("The INTERSECTION of your -L options produced no intervals."); // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have - return retList; + return Collections.unmodifiableList(retList); } /** @@ -218,6 +222,8 @@ public class IntervalUtils { * @return A sorted, merged version of the intervals passed in. 
*/ public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocParser parser, List intervals, IntervalMergingRule mergingRule) { + // Make a copy of the (potentially unmodifiable) list to be sorted + intervals = new ArrayList(intervals); // sort raw interval list Collections.sort(intervals); // now merge raw interval list @@ -481,6 +487,70 @@ public class IntervalUtils { return new SplitLocusRecursive(split, remaining); } + /** + * Setup the intervals to be processed + */ + public static GenomeLocSortedSet parseIntervalBindings( + final ReferenceDataSource referenceDataSource, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + + Pair includeExcludePair = parseIntervalBindingsPair( + referenceDataSource, intervals, intervalSetRule, intervalMergingRule, intervalPadding, excludeIntervals); + + GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + if (excludeSortedSet != null) { + return includeSortedSet.subtractRegions(excludeSortedSet); + } else { + return includeSortedSet; + } + } + + public static Pair parseIntervalBindingsPair( + final ReferenceDataSource referenceDataSource, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + GenomeLocParser genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); + + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = ((intervals == null || intervals.size() == 0) ? 
+ GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(intervals, intervalSetRule, intervalMergingRule, intervalPadding, genomeLocParser)); + + GenomeLocSortedSet excludeSortedSet = null; + if (excludeIntervals != null && excludeIntervals.size() > 0) { + excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, 0, genomeLocParser); + } + return new Pair(includeSortedSet, excludeSortedSet); + } + + public static GenomeLocSortedSet loadIntervals( + final List> intervalBindings, + final IntervalSetRule rule, final IntervalMergingRule intervalMergingRule, final int padding, + final GenomeLocParser genomeLocParser) { + List allIntervals = new ArrayList(); + for ( IntervalBinding intervalBinding : intervalBindings) { + @SuppressWarnings("unchecked") + List intervals = intervalBinding.getIntervals(genomeLocParser); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + if ( padding > 0 ) { + intervals = getIntervalsWithFlanks(genomeLocParser, intervals, padding); + } + + allIntervals = mergeListsBySetOperator(intervals, allIntervals, rule); + } + + return sortAndMergeIntervals(genomeLocParser, allIntervals, intervalMergingRule); + } + private final static class SplitLocusRecursive { final List split; final LinkedList remaining; @@ -546,7 +616,7 @@ public class IntervalUtils { */ public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { if (raw.size() <= 1) - return raw; + return Collections.unmodifiableList(raw); else { ArrayList merged = new ArrayList(); Iterator it = raw.iterator(); @@ -555,7 +625,7 @@ public class IntervalUtils { GenomeLoc curr = it.next(); if (prev.overlapsP(curr)) { prev = prev.merge(curr); - } else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) { + } else if (prev.contiguousP(curr) && (rule == 
null || rule == IntervalMergingRule.ALL)) { prev = prev.merge(curr); } else { merged.add(prev); @@ -563,7 +633,7 @@ public class IntervalUtils { } } merged.add(prev); - return merged; + return Collections.unmodifiableList(merged); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 631d69858..3612693da 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -28,10 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.*; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.util.*; /** * Utility methods to facilitate on-the-fly base quality score recalibration. 
@@ -45,39 +45,15 @@ public class BaseRecalibration { private final ReadCovariates readCovariates; private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) - private final KeysAndTables keysAndTables; + private final RecalibrationTables recalibrationTables; private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - static class KeysAndTables { + private final Object[] tempKeySet; - public enum Type { - READ_GROUP_TABLE(0), - QUALITY_SCORE_TABLE(1), - OPTIONAL_COVARIATE_TABLE(2); - - private final int index; - - private Type(int index) { - this.index = index; - } - } - - public final BQSRKeyManager[] managers = new BQSRKeyManager[Type.values().length]; - public final Map[] tables = new Map[Type.values().length]; - - public KeysAndTables(final Map> keysAndTablesMap) { - for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { - Type type; - if (mapEntry.getKey().getNumRequiredCovariates() == 1) - type = Type.READ_GROUP_TABLE; - else if (mapEntry.getKey().getNumOptionalCovariates() == 0) - type = Type.QUALITY_SCORE_TABLE; - else - type = Type.OPTIONAL_COVARIATE_TABLE; - managers[type.index] = mapEntry.getKey(); - tables[type.index] = mapEntry.getValue(); - } - } + private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. 
+ static { + for (int i = 0; i < EventType.values().length; i++) + qualityScoreByFullCovariateKey[i] = new NestedHashMap(); } /** @@ -89,7 +65,7 @@ public class BaseRecalibration { public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - keysAndTables = new KeysAndTables(recalibrationReport.getKeysAndTablesMap()); + recalibrationTables = recalibrationReport.getRecalibrationTables(); requestedCovariates = recalibrationReport.getRequestedCovariates(); quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores @@ -98,20 +74,22 @@ public class BaseRecalibration { quantizationInfo.quantizeQualityScores(quantizationLevels); readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + tempKeySet = new Integer[requestedCovariates.length]; } /** * This constructor only exists for testing purposes. 
* * @param quantizationInfo the quantization info object - * @param keysAndTablesMap the map of key managers and recalibration tables + * @param recalibrationTables the map of key managers and recalibration tables * @param requestedCovariates the list of requested covariates */ - protected BaseRecalibration(final QuantizationInfo quantizationInfo, final LinkedHashMap> keysAndTablesMap, final Covariate[] requestedCovariates) { + protected BaseRecalibration(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { this.quantizationInfo = quantizationInfo; - keysAndTables = new KeysAndTables(keysAndTablesMap); + this.recalibrationTables = recalibrationTables; this.requestedCovariates = requestedCovariates; readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + tempKeySet = new Integer[requestedCovariates.length]; } /** @@ -125,13 +103,20 @@ public class BaseRecalibration { RecalDataManager.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings final byte[] quals = read.getBaseQualities(errorModel); + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model + + final int readLength = read.getReadLength(); + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read - for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read final byte originalQualityScore = quals[offset]; if (originalQualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - final long[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model - final byte 
recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error model + Byte recalibratedQualityScore = (Byte) qualityScoreByFullCovariateKey[errorModel.index].get(wrapKeySet(keySet)); + if (recalibratedQualityScore == null) { + recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + qualityScoreByFullCovariateKey[errorModel.index].put(recalibratedQualityScore, keySet); + } quals[offset] = recalibratedQualityScore; } } @@ -139,7 +124,11 @@ public class BaseRecalibration { } } - + private Object[] wrapKeySet(final int[] keySet) { + for (int i = 0; i < keySet.length; i++) + tempKeySet[i] = keySet[i]; + return tempKeySet; + } /** * Implements a serial recalibration of the reads using the combinational table. @@ -158,24 +147,23 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - protected byte performSequentialQualityCalculation(final long[] key, final EventType errorModel) { + protected byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { - final double globalDeltaQ = calculateGlobalDeltaQ(keysAndTables.managers[KeysAndTables.Type.READ_GROUP_TABLE.index], keysAndTables.tables[KeysAndTables.Type.READ_GROUP_TABLE.index], key, errorModel); - final double deltaQReported = calculateDeltaQReported(keysAndTables.managers[KeysAndTables.Type.QUALITY_SCORE_TABLE.index], keysAndTables.tables[KeysAndTables.Type.QUALITY_SCORE_TABLE.index], key, errorModel, globalDeltaQ); - final double deltaQCovariates = calculateDeltaQCovariates(keysAndTables.managers[KeysAndTables.Type.OPTIONAL_COVARIATE_TABLE.index], keysAndTables.tables[KeysAndTables.Type.OPTIONAL_COVARIATE_TABLE.index], key, errorModel, globalDeltaQ, deltaQReported); + final byte qualFromRead = (byte)(long)key[1]; + final double 
globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel); + final double deltaQReported = calculateDeltaQReported(recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE), key, errorModel, globalDeltaQ, qualFromRead); + final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE), key, errorModel, globalDeltaQ, deltaQReported, qualFromRead); - final byte qualFromRead = (byte)key[1]; double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } - private double calculateGlobalDeltaQ(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel) { + private double calculateGlobalDeltaQ(final NestedHashMap table, final int[] key, final EventType errorModel) { double result = 0.0; - final long masterKey = keyManager.createMasterKey(key, errorModel, -1); - final RecalDatum empiricalQualRG = table.get(masterKey); + final RecalDatum empiricalQualRG = (RecalDatum)table.get(key[0], errorModel.index); if (empiricalQualRG != null) { final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); final double aggregrateQReported = empiricalQualRG.getEstimatedQReported(); @@ -185,32 +173,28 @@ public class BaseRecalibration { return result; } - private double calculateDeltaQReported(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel, final double globalDeltaQ) { + private double calculateDeltaQReported(final NestedHashMap 
table, final int[] key, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { double result = 0.0; - final long masterKey = keyManager.createMasterKey(key, errorModel, -1); - final RecalDatum empiricalQualQS = table.get(masterKey); + final RecalDatum empiricalQualQS = (RecalDatum)table.get(key[0], key[1], errorModel.index); if (empiricalQualQS != null) { final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); - final byte qualFromRead = (byte)key[1]; result = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } return result; } - private double calculateDeltaQCovariates(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel, final double globalDeltaQ, final double deltaQReported) { + private double calculateDeltaQCovariates(final NestedHashMap table, final int[] key, final EventType errorModel, final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { double result = 0.0; - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - for (int i = 0; i < numOptionalCovariates; i++) { - final long masterKey = keyManager.createMasterKey(key, errorModel, i); - if (masterKey < 0) + // for all optional covariates + for (int i = 2; i < requestedCovariates.length; i++) { + if (key[i] < 0) continue; - final RecalDatum empiricalQualCO = table.get(masterKey); + final RecalDatum empiricalQualCO = (RecalDatum)table.get(key[0], key[1], (i-2), key[i], errorModel.index); if (empiricalQualCO != null) { final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); - final byte qualFromRead = (byte)key[1]; result += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java new file mode 100644 index 000000000..aa77b5142 --- 
/dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.utils.collections.NestedHashMap; + +/** + * Utility class to facilitate on-the-fly base quality score recalibration. 
+ * + * User: ebanks + * Date: 6/20/12 + */ + +public class RecalibrationTables { + + public enum TableType { + READ_GROUP_TABLE(0), + QUALITY_SCORE_TABLE(1), + OPTIONAL_COVARIATE_TABLE(2); + + private final int index; + + private TableType(final int index) { + this.index = index; + } + } + + private final NestedHashMap[] tables = new NestedHashMap[TableType.values().length]; + + public RecalibrationTables(final NestedHashMap rgMap, final NestedHashMap qualMap, final NestedHashMap covMap) { + tables[TableType.READ_GROUP_TABLE.index] = rgMap; + tables[TableType.QUALITY_SCORE_TABLE.index] = qualMap; + tables[TableType.OPTIONAL_COVARIATE_TABLE.index] = covMap; + } + + public NestedHashMap getTable(final TableType type) { + return tables[type.index]; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index f45b0e615..d268aabc6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -554,7 +554,7 @@ public abstract class Genotype implements Comparable { pairs.add(k + "=" + c.get(k)); } - return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; + return pairs.isEmpty() ? 
"" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index fa41a3c99..d644eda7d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -34,6 +36,11 @@ import java.util.Arrays; import java.util.EnumMap; public class GenotypeLikelihoods { + private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; + private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10; + // caching numAlleles up to 5 and ploidy up to 10 + private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY]; + public final static int MAX_PL = Short.MAX_VALUE; // @@ -44,6 +51,29 @@ public class GenotypeLikelihoods { private double[] log10Likelihoods = null; private String likelihoodsAsString_PLs = null; + + /** + * initialize num likelihoods cache + */ + static { + // must be done before PLIndexToAlleleIndex + for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) { + for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) { + numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy); + } + } + } + + /** + * The maximum number of alleles that we can represent as genotype likelihoods + */ + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; + + /* + * a cache of the PL index to the 2 
alleles it represents over all possible numbers of alternate alleles + */ + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); + public final static GenotypeLikelihoods fromPLField(String PLs) { return new GenotypeLikelihoods(PLs); } @@ -245,47 +275,11 @@ public class GenotypeLikelihoods { return likelihoodsAsVector; } -// // ------------------------------------------------------------------------------------- -// // -// // List interface functions -// // -// // ------------------------------------------------------------------------------------- -// -// private final void notImplemented() { -// throw new ReviewedStingException("BUG: code not implemented"); -// } -// -// @Override public int size() { return getAsVector().length; } -// @Override public Double get(final int i) { return getAsVector()[i];} -// @Override public Double set(final int i, final Double aDouble) { return getAsVector()[i] = aDouble; } -// @Override public boolean isEmpty() { return false; } -// @Override public Iterator iterator() { return Arrays.asList(ArrayUtils.toObject(getAsVector())).iterator(); } -// @Override public Object[] toArray() { return ArrayUtils.toObject(getAsVector()); } -// -// // none of these are implemented -// @Override public boolean contains(final Object o) { notImplemented(); return false; } -// @Override public T[] toArray(final T[] ts) { notImplemented(); return null; } -// @Override public boolean add(final Double aDouble) { notImplemented(); return false; } -// @Override public boolean remove(final Object o) {notImplemented(); return false; } -// @Override public boolean containsAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean addAll(final Collection doubles) { notImplemented(); return false; } -// @Override public boolean addAll(final int i, final Collection doubles) { notImplemented(); return false; } -// @Override public boolean 
removeAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean retainAll(final Collection objects) { notImplemented(); return false; } -// @Override public void clear() { notImplemented(); } -// @Override public void add(final int i, final Double aDouble) { notImplemented(); } -// @Override public Double remove(final int i) { notImplemented(); return null; } -// @Override public int indexOf(final Object o) { notImplemented(); return -1; } -// @Override public int lastIndexOf(final Object o) { notImplemented(); return 0; } -// @Override public ListIterator listIterator() { notImplemented(); return null; } -// @Override public ListIterator listIterator(final int i) { notImplemented(); return null; } -// @Override public List subList(final int i, final int i1) { notImplemented(); return null; } - -// ------------------------------------------------------------------------------------- -// -// Static conversion utilities, going from GL/PL index to allele index and vice versa. -// -// ------------------------------------------------------------------------------------- + // ------------------------------------------------------------------------------------- + // + // Static conversion utilities, going from GL/PL index to allele index and vice versa. + // + // ------------------------------------------------------------------------------------- /* * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index. 
@@ -300,18 +294,8 @@ public class GenotypeLikelihoods { } } - /** - * The maximum number of alleles that we can represent as genotype likelihoods - */ - public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; - - /* - * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles - */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); - private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); + final int numLikelihoods = numLikelihoods(1 + altAlleles, 2); final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; // for all possible combinations of 2 alleles @@ -330,6 +314,32 @@ public class GenotypeLikelihoods { return cache; } + // ------------------------------------------------------------------------------------- + // + // num likelihoods given number of alleles and ploidy + // + // ------------------------------------------------------------------------------------- + + /** + * Actually does the computation in @see #numLikelihoods + * + * @param numAlleles + * @param ploidy + * @return + */ + private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) { + if (numAlleles == 1) + return 1; + else if (ploidy == 1) + return numAlleles; + else { + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += calcNumLikelihoods(numAlleles - 1, ploidy - k); + return acc; + } + } + /** * Compute how many likelihood elements are associated with the given number of alleles * Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P) @@ -344,6 +354,8 @@ public class GenotypeLikelihoods { * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2) * In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2 * + * Note 
this method caches the value for most common num Allele / ploidy combinations for efficiency + * * Recursive implementation: * S(N,P) = sum_{k=0}^P S(N-1,P-k) * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K @@ -355,23 +367,16 @@ public class GenotypeLikelihoods { * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. */ - public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { - - // fast, closed form solution for diploid samples (most common use case) - if (ploidy==2) - return numAlleles*(numAlleles+1)/2; - - if (numAlleles == 1) - return 1; - else if (ploidy == 1) - return numAlleles; - - int acc =0; - for (int k=0; k <= ploidy; k++ ) - acc += calculateNumLikelihoods(numAlleles-1, ploidy-k); - - return acc; - + @Requires({"ploidy > 0", "numAlleles > 0"}) + @Ensures("result > 0") + public static int numLikelihoods(final int numAlleles, final int ploidy) { + if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES + && ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY ) + return numLikelihoodCache[numAlleles][ploidy]; + else { + // have to calculate on the fly + return calcNumLikelihoods(numAlleles, ploidy); + } } // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index fc4175735..ba8668fa9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -61,6 +61,11 @@ public class GenotypesContext implements List { */ ArrayList notToBeDirectlyAccessedGenotypes; + /** + * Cached value of the maximum ploidy observed among all samples + */ + private int maxPloidy = -1; + /** Are we allowing users to modify the list? */ boolean immutable = false; @@ -408,6 +413,17 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + @Ensures("result >= 0") + public int getMaxPloidy() { + if ( maxPloidy == -1 ) { + maxPloidy = 0; // necessary in the case where there are no genotypes + for ( final Genotype g : getGenotypes() ) { + maxPloidy = Math.max(g.getPloidy(), maxPloidy); + } + } + return maxPloidy; + } + /** * Gets sample associated with this sampleName, or null if none is found * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 8908782f1..dc600d97c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -626,14 +626,13 @@ public class VariantContext implements Feature { // to enable tribble integratio /** * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * + * This function is caching, so it's only expensive on the first call + * * @return -1, or the max ploidy */ public int getMaxPloidy() { - int max = -1; - for ( final Genotype g : getGenotypes() ) { - max = Math.max(g.getPloidy(), max); - } - return 
max; + return genotypes.getMaxPloidy(); } /** @@ -1337,13 +1336,13 @@ public class VariantContext implements Feature { // to enable tribble integratio * @param header containing types about all fields in this VC * @return a fully decoded version of this VC */ - public VariantContext fullyDecode(final VCFHeader header) { + public VariantContext fullyDecode(final VCFHeader header, final boolean lenientDecoding) { if ( isFullyDecoded() ) return this; else { // TODO -- warning this is potentially very expensive as it creates copies over and over final VariantContextBuilder builder = new VariantContextBuilder(this); - fullyDecodeInfo(builder, header); + fullyDecodeInfo(builder, header, lenientDecoding); fullyDecodeGenotypes(builder, header); builder.fullyDecoded(true); return builder.make(); @@ -1358,13 +1357,13 @@ public class VariantContext implements Feature { // to enable tribble integratio return fullyDecoded; } - private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header) { - builder.attributes(fullyDecodeAttributes(getAttributes(), header, false)); + private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) { + builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding)); } private final Map fullyDecodeAttributes(final Map attributes, final VCFHeader header, - final boolean allowMissingValuesComparedToHeader) { + final boolean lenientDecoding) { final Map newAttributes = new HashMap(attributes.size()); for ( final Map.Entry attr : attributes.entrySet() ) { @@ -1377,11 +1376,11 @@ public class VariantContext implements Feature { // to enable tribble integratio final Object decoded = decodeValue(field, attr.getValue(), format); if ( decoded != null && - ! allowMissingValuesComparedToHeader + ! 
lenientDecoding && format.getCountType() != VCFHeaderLineCount.UNBOUNDED && format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1; - final int expSize = format.getCount(this.getNAlleles() - 1); + final int expSize = format.getCount(this); if ( obsSize != expSize ) { throw new UserException.MalformedVCFHeader("Discordant field size detected for field " + field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " + @@ -1431,7 +1430,7 @@ public class VariantContext implements Feature { // to enable tribble integratio switch ( format.getType() ) { case Character: return string; case Flag: - final boolean b = Boolean.valueOf(string); + final boolean b = Boolean.valueOf(string) || string.equals("1"); if ( b == false ) throw new UserException.MalformedVCF("VariantContext FLAG fields " + field + " cannot contain false values" + " as seen at " + getChr() + ":" + getStart()); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 83ddd2a1f..01d3ab456 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -159,16 +159,20 @@ public class VariantContextBuilder { return this; } - public VariantContextBuilder alleles(final String ... 
alleleStrings) { - List alleles = new ArrayList(alleleStrings.length); + public VariantContextBuilder alleles(final List alleleStrings) { + List alleles = new ArrayList(alleleStrings.size()); - for ( int i = 0; i < alleleStrings.length; i++ ) { - alleles.add(Allele.create(alleleStrings[i], i == 0)); + for ( int i = 0; i < alleleStrings.size(); i++ ) { + alleles.add(Allele.create(alleleStrings.get(i), i == 0)); } return alleles(alleles); } + public VariantContextBuilder alleles(final String ... alleleStrings) { + return alleles(Arrays.asList(alleleStrings)); + } + public List getAlleles() { return new ArrayList(alleles); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 223b4509b..ccc0f5971 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -46,6 +46,7 @@ public class VariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final public static JexlEngine engine = new JexlEngine(); public static final int DEFAULT_PLOIDY = 2; @@ -57,6 +58,31 @@ public class VariantContextUtils { engine.setDebug(false); } + /** + * Ensures that VC contains all of the samples in allSamples by adding missing samples to + * the resulting VC with default diploid ./. genotypes + * + * @param vc + * @param allSamples + * @return + */ + public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { + // TODO -- what's the fastest way to do this calculation? 
+ final Set missingSamples = new HashSet(allSamples); + missingSamples.removeAll(vc.getSampleNames()); + + if ( missingSamples.isEmpty() ) + return vc; + else { + //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); + final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); + for ( final String missing : missingSamples ) { + gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); + } + return new VariantContextBuilder(vc).genotypes(gc).make(); + } + } + /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags @@ -1199,8 +1225,8 @@ public class VariantContextUtils { altAlleleIndexToUse[i] = true; } - // calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); + // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index ecc1cd3e0..812e6dd07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -185,7 +185,7 @@ public abstract class BCF2FieldEncoder { @Requires("hasContextDeterminedNumElements()") @Ensures("result >= 0") public int numElements(final VariantContext vc) { - return 
headerLine.getCount(vc.getNAlleles() - 1); + return headerLine.getCount(vc); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index eada05578..5555849dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -155,7 +155,7 @@ class BCF2Writer extends IndexingVariantContextWriter { public void add( VariantContext vc ) { if ( doNotWriteGenotypes ) vc = new VariantContextBuilder(vc).noGenotypes().make(); - vc = vc.fullyDecode(header); + vc = vc.fullyDecode(header, false); super.add(vc); // allow on the fly indexing @@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) - // we don't have any data about g at all - g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make(); + if ( g == null ) VCFWriter.missingSampleError(vc, header); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index fcd3eb071..ee7b1b9ef 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { -// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) { -// final List badSampleNames = new ArrayList(); -// for ( final Genotype g : vc.getGenotypes() ) -// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) ) -// badSampleNames.add(g.getSampleName()); -// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames)); -// } - for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); if ( g == null ) { - // TODO -- The VariantContext needs to know what the general ploidy is of the samples - // TODO -- We shouldn't be assuming diploid genotypes here! - mWriter.write(VCFConstants.EMPTY_GENOTYPE); - continue; + missingSampleError(vc, mHeader); } List attrs = new ArrayList(genotypeFormatKeys.size()); @@ -402,7 +392,7 @@ class VCFWriter extends IndexingVariantContextWriter { VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size()); + int numInFormatField = metaData.getCount(vc); if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. // For example, if Number=2, the string has to be ".,." 
@@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter { } } + public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { + final List badSampleNames = new ArrayList(); + for ( final String x : header.getGenotypeSamples() ) + if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); + throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); + } + private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); @@ -569,6 +566,6 @@ class VCFWriter extends IndexingVariantContextWriter { + " at " + vc.getChr() + ":" + vc.getStart() + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + " complete VCF headers by default. This error can be disabled with the engine argument" - + " --allowMissingVCFHeaders"); + + " -U LENIENT_VCF_PROCESSING"); } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 708dd042e..a997385d6 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -79,7 +79,7 @@ public class WalkerTest extends BaseTest { public void validateOutputBCFIfPossible(final String name, final File resultFile) { final File bcfFile = BCF2Utils.shadowBCF(resultFile); - if ( bcfFile.exists() ) { + if ( bcfFile != null && bcfFile.exists() ) { logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); try { VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 3ce62b697..2f8b1e9b5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -24,32 +24,17 @@ package org.broadinstitute.sting.gatk; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.util.Interval; -import net.sf.picard.util.IntervalList; -import net.sf.samtools.SAMFileHeader; -import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; -import org.broadinstitute.sting.commandline.IntervalBinding; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; -import java.util.List; - /** * Tests selected functionality in the GenomeAnalysisEngine class @@ -91,65 +76,4 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } - - @DataProvider(name="invalidIntervalTestData") - public Object[][] invalidIntervalDataProvider() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - GATKArgumentCollection argCollection = new GATKArgumentCollection(); - testEngine.setArguments(argCollection); - 
- File fastaFile = new File("public/testdata/exampleFASTA.fasta"); - GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); - testEngine.setGenomeLocParser(genomeLocParser); - - return new Object[][] { - new Object[] {testEngine, genomeLocParser, "chr1", 10000000, 20000000}, - new Object[] {testEngine, genomeLocParser, "chr2", 1, 2}, - new Object[] {testEngine, genomeLocParser, "chr1", -1, 50} - }; - } - - @Test(dataProvider="invalidIntervalTestData") - public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - SAMFileHeader picardFileHeader = new SAMFileHeader(); - picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); - IntervalList picardIntervals = new IntervalList(picardFileHeader); - picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); - - File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); - picardIntervals.write(picardIntervalFile); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - - testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); - } - - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") - public void testInvalidGATKFileIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", - String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - - testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); - } - - private File createTempFile( String tempFilePrefix, 
String tempFileExtension, String... lines ) throws Exception { - File tempFile = File.createTempFile(tempFilePrefix, tempFileExtension); - tempFile.deleteOnExit(); - - PrintWriter out = new PrintWriter(tempFile); - for ( String line : lines ) { - out.println(line); - } - out.close(); - - return tempFile; - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java index 9d14cd74c..4e6fe5939 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java @@ -58,8 +58,8 @@ public class TestRMDTrackBuilder extends RMDTrackBuilder { Index index; try { // Create a feature reader that creates checkable tribble iterators. + index = loadIndex(inputFile, codec); featureReader = new TestFeatureReader(inputFile.getAbsolutePath(), codec); - index = loadFromDisk(inputFile, Tribble.indexFile(inputFile)); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index b83ef67c4..0b45dc931 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -16,7 +16,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("55785745fe13ad81a2c4a14373d091f0")); + 
Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); executeTest("test file has annotations, not asking for annotations, #1", spec); } @@ -24,7 +24,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d6f749f8dbeb2d42c9effaff9fe571d7")); + Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); executeTest("test file has annotations, not asking for annotations, #2", spec); } @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("9084e6c7b1cec0f3a2c6d96711844d5e")); + Arrays.asList("e0a08416249515ea18bd0663c90c9330")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("3dfabdcaa2648ac34380fb71860c42d3")); + Arrays.asList("0b60da46ba0eabb3abe5e0288937f9b0")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -48,7 +48,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + 
"low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b85c1ea28194484b327fbe0add1b5685")); + Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); } @@ -58,7 +58,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { // they don't get reordered. It's a good test of the genotype ordering system. WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("fe4d4e2484c4cf8b1cd50ad42cfe468e")); + Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("043fc6205b0633edcd3fadc9e044800c")); + Arrays.asList("5eb576d0234c912d8efea184492691d0")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("6fafb42d374a67ba4687a23078a126af")); + Arrays.asList("8860524d793d24b2e32f318433fcf527")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest 
extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("639462a0e0fa79e33def5f011fe55961")); + Arrays.asList("f33f417fad98c05d9cd08ffa22943b0f")); executeTest("test exclude annotations", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("afe6c9d3b4b80635a541cdfcfa48db2f")); + Arrays.asList("1c423b7730b9805e7b885ece924286e0")); executeTest("not passing it any reads", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("21d696ea8c55d2fd4cbb4dcd5f7f7db6")); + Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); executeTest("getting DB tag with dbSNP", spec); } @@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("ef95394c14d5c16682a322f3dfb9000c")); + Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -122,7 +122,7 @@ public class 
VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("e6e276b7d517d57626c8409589cd286f")); + Arrays.asList("cc7184263975595a6e2473d153227146")); executeTest("getting DB tag with HM3", spec); } @@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoQuals() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("a99e8315571ed1b6bce942451b3d8612")); + Arrays.asList("aea983adc01cd059193538cc30adc17d")); executeTest("test file doesn't have QUALs", spec); } @@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("7d6ea3b54210620cbc7e14dad8836bcb")); + Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); executeTest("using expression", spec); } @@ -146,13 +146,13 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("35ce4fb0288dfc5c01ec6ce8b14c6157")); + Arrays.asList("3de1d1998203518098ffae233f3e2352")); 
executeTest("using expression with ID", spec); } @Test public void testTabixAnnotations() { - final String MD5 = "5aebcf8f76c649d645708b1262185c80"; + final String MD5 = "99938d1e197b8f10c408cac490a00a62"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, @@ -168,7 +168,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("0c20cda1cf0b903a287f1807ae5bee02") + Arrays.asList("d9291845ce5a8576898d293a829a05b7") ); executeTest("Testing SnpEff annotations", spec); } @@ -187,7 +187,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testTDTAnnotation() { - final String MD5 = "81f85f0ce8cc36df7c717c478e100ba1"; + final String MD5 = "427dfdc665359b67eff210f909ebf8a2"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, @@ -198,7 +198,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testChromosomeCountsPed() { - final String MD5 = "9830fe2247651377e68ad0b0894e9a4e"; + final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + 
privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, @@ -208,7 +208,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testInbreedingCoeffPed() { - final String MD5 = "e94d589b5691e3ecfd9cc9475a384890"; + final String MD5 = "159a771c1deaeffb786097e106943893"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 99710831d..8fe96b53d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea")); + "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } @@ -51,7 +51,7 @@ public class BeagleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ProduceBeagleInput -R " + hg19Reference + " " + "--variant:VCF3 " + beagleValidationDataLocation + 
"inttestbgl.input.vcf " + - "-o %s --allowMissingVCFHeaders", 1, Arrays.asList("f301b089d21da259873f04bdc468835d")); + "-o %s -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("f301b089d21da259873f04bdc468835d")); spec.disableShadowBCF(); executeTest("test BeagleInput", spec); } @@ -61,7 +61,7 @@ public class BeagleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ProduceBeagleInput --variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ "--validation:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ - "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 --allowMissingVCFHeaders -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2, + "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -U LENIENT_VCF_PROCESSING -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2, Arrays.asList("660986891b30cdc937e0f2a3a5743faa","4b6417f892ccfe5c63b8a60cb0ef3740")); spec.disableShadowBCF(); executeTest("test BeagleInputWithBootstrap",spec); @@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header --allowMissingVCFHeaders",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java deleted file 
mode 100644 index da1678d54..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java +++ /dev/null @@ -1,158 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -/** - * @author Mauricio Carneiro - * @since 3/7/12 - */ -public class BQSRKeyManagerUnitTest { - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - } - - @Test(enabled = false) - public void testCombineBitSets() { - final int nRequired = 2; - final ArrayList covariates = new ArrayList(); - covariates.add(new ReadGroupCovariate()); - covariates.add(new QualityScoreCovariate()); - covariates.add(new CycleCovariate()); - covariates.add(new ContextCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = true) - public void testOnlyRequiredCovariates() { - final int nRequired = 2; - final ArrayList covariates = new ArrayList(2); - covariates.add(new ReadGroupCovariate()); - covariates.add(new QualityScoreCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = true) - public void testOnlyOneCovariate() { - final int nRequired = 1; - final ArrayList covariates = new ArrayList(2); - covariates.add(new ReadGroupCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = false) - public void testOneCovariateWithOptionalCovariates() { - final int nRequired = 1; - final ArrayList covariates = new ArrayList(4); - covariates.add(new ReadGroupCovariate()); - covariates.add(new 
QualityScoreCovariate()); - covariates.add(new CycleCovariate()); - covariates.add(new ContextCovariate()); - createReadAndTest(covariates, nRequired); - } - - private void createReadAndTest(List covariates, int nRequired) { - int readLength = 1000; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M"); - read.setReadGroup(new GATKSAMReadGroupRecord("ID")); - read.getReadGroup().setPlatform("illumina"); - - runTestOnRead(read, covariates, nRequired); - read.setReadNegativeStrandFlag(true); - runTestOnRead(read, covariates, nRequired); - read.setReadPairedFlag(true); - read.setSecondOfPairFlag(true); - runTestOnRead(read, covariates, nRequired); - read.setReadNegativeStrandFlag(false); - runTestOnRead(read, covariates, nRequired); - } - - private void runTestOnRead(GATKSAMRecord read, List covariateList, int nRequired) { - final long[][][] covariateKeys = new long[covariateList.size()][EventType.values().length][read.getReadLength()]; - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), covariateList.size()); - for (int i = 0; i < covariateList.size(); i++) { - final Covariate cov = covariateList.get(i); - cov.initialize(RAC); - readCovariates.setCovariateIndex(i); - cov.recordValues(read, readCovariates); - } - for (int i = 0; i < read.getReadLength(); i++) { - for (EventType eventType : EventType.values()) { - final long[] vals = readCovariates.getKeySet(i, eventType); - for (int j = 0; j < vals.length; j++) - covariateKeys[j][eventType.index][i] = vals[j]; - } - } - - List requiredCovariates = new LinkedList(); - List optionalCovariates = new LinkedList(); - - for (int j=0; j optionalCovariates, - final Object[] expectedRequired, final Object[] expectedCovariate, final EventType eventType, final int index) { - - Object[] actual = keyManager.keySetFrom(key).toArray(); - - // Build the expected array - Object[] expected = 
new Object[nRequired + (optionalCovariates.size() > 0 ? 3 : 1)]; - System.arraycopy(expectedRequired, 0, expected, 0, nRequired); - if (optionalCovariates.size() > 0) { - expected[expected.length-3] = expectedCovariate[index]; - expected[expected.length-2] = optionalCovariates.get(index).getClass().getSimpleName().split("Covariate")[0]; - } - expected[expected.length-1] = eventType; - -// System.out.println("Actual : " + Utils.join(",", Arrays.asList(actual))); -// System.out.println("Expected: " + Utils.join(",", Arrays.asList(expected))); -// System.out.println(); - - for (int k = 0; k < expected.length; k++) - Assert.assertEquals(actual[k], expected[k]); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index ee5395454..553b7e237 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -36,7 +36,7 @@ public class ContextCovariateUnitTest { verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); } - public static void verifyCovariateArray(long[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { for (int i = 0; i < values.length; i++) Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java index 79b57fd8f..3fa1e916d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java @@ -47,7 +47,7 @@ public class CycleCovariateUnitTest { verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); } - private void verifyCovariateArray(long[][] values, int init, int increment) { + private void verifyCovariateArray(int[][] values, int init, int increment) { for (short i = 0; i < values.length; i++) { short actual = Short.decode(covariate.formatKey(values[i][0])); int expected = init + (increment * i); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java index 4970413e8..a83508353 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java @@ -46,8 +46,8 @@ public class ReadGroupCovariateUnitTest { } - private void verifyCovariateArray(long[][] values, String expected) { - for (long[] value : values) { + private void verifyCovariateArray(int[][] values, String expected) { + for (int[] value : values) { String actual = covariate.formatKey(value[0]); Assert.assertEquals(actual, expected); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java index e5fde0efc..d1f2d6342 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java @@ -1,7 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; 
+import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -32,7 +34,6 @@ public class RecalibrationReportUnitTest { final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final LinkedHashMap> keysAndTablesMap = new LinkedHashMap>(); quantizationInfo.noQuantization(); final List requiredCovariates = new LinkedList(); @@ -41,14 +42,10 @@ public class RecalibrationReportUnitTest { final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(rgKeyManager, new HashMap()); final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(qsKeyManager, new HashMap()); final ContextCovariate cxCovariate = new ContextCovariate(); cxCovariate.initialize(RAC); @@ -56,8 +53,6 @@ public class RecalibrationReportUnitTest { final CycleCovariate cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(cvKeyManager, new HashMap()); final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; int covariateIndex = 0; @@ -75,34 +70,35 @@ public class RecalibrationReportUnitTest { readQuals[i] = 20; read.setBaseQualities(readQuals); - final int expectedKeys = expectedNumberOfKeys(4, length, 
RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); int nKeys = 0; // keep track of how many keys were produced final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); - for (int offset = 0; offset < length; offset++) { - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = entry.getKey(); - Map table = entry.getValue(); - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - if (numOptionalCovariates == 0) { - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION, -1), RecalDatum.createRandomRecalDatum(10000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_INSERTION, -1), RecalDatum.createRandomRecalDatum(100000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_DELETION, -1), RecalDatum.createRandomRecalDatum(100000, 10)); - nKeys += 3; - } else { - for (int j = 0; j < numOptionalCovariates; j++) { - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION, j), RecalDatum.createRandomRecalDatum(10000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_INSERTION, j), RecalDatum.createRandomRecalDatum(100000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_DELETION, j), RecalDatum.createRandomRecalDatum(100000, 10)); - nKeys += 3; - } + final NestedHashMap rgTable = new NestedHashMap(); + final NestedHashMap qualTable = new NestedHashMap(); + final NestedHashMap covTable = new NestedHashMap(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 
10000 : 100000; + + rgTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); + qualTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + covTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], j, covariates[2 + j], errorMode.index); + nKeys++; } } } Assert.assertEquals(nKeys, expectedKeys); - RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC); + final RecalibrationTables recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); + + final RecalibrationReport report = new RecalibrationReport(quantizationInfo, recalibrationTables, RAC.generateReportTable(), RAC); File output = new File("RecalibrationReportUnitTestOutuput.grp"); PrintStream out; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 355071e73..63b2d39f1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -36,18 +36,19 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { final String L = validationDataLocation + "DT-itest.interval_list"; private void DTTest(String testName, String args, String md5) { - String base = String.format("-T DiagnoseTargets -R %s -L %s", REF, L) + " -o %s "; + String base = String.format("-T DiagnoseTargets --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); + spec.disableShadowBCF(); executeTest(testName, spec); } 
@Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "2df47009571fe83ead779c94be97fe96"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "ef71a569a48697c89e642cdda7bfb766"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "6f0c070b9671e1d007ce6374c3183014"); + DTTest("testMultiSample ", "-I " + multiSample, "1e6e15156e01e736274898fdac77d911"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 573f25b70..70a10a0b5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testNoAction() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbf88e25df30181ca5422a374c7b36fa")); + Arrays.asList("a890cd298298e22bc04a2e5a20b71170")); executeTest("test no action", spec); } @@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testClusteredSnps() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -window 10 --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("bb69f49e9ef0054f0ccd6d38f5ffa46a")); + Arrays.asList("f46b2fe2dbe6a423b5cfb10d74a4966d")); executeTest("test clustered SNPs", spec); } @@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask " + 
privateTestDir + "vcfexample2.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("7e3225a32fcd6066901247992b2c5ca8")); + Arrays.asList("86dbbf62a0623b2dc5e8969c26d8cb28")); executeTest("test mask all", spec1); } @@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF " + privateTestDir + "vcfMask.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("3485fe95e3f0864c3575baf05cef4bcc")); + Arrays.asList("2fb33fccda1eafeea7a2f8f9219baa39")); executeTest("test mask some", spec2); } @@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask3() { WalkerTestSpec spec3 = new WalkerTestSpec( baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + privateTestDir + "vcfMask.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("367ab9c028a68e4eda2055e3bb8b486c")); + Arrays.asList("4351e00bd9d821e37cded5a86100c973")); executeTest("test mask extend", spec3); } @@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("5a10d969e50a58d8dfbf1da54bf293df")); + Arrays.asList("2f056b50a41c8e6ba7645ff4c777966d")); executeTest("test filter #1", spec); } @@ -64,7 +64,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - 
Arrays.asList("886dbbca2350083819ff67224f6efbd6")); + Arrays.asList("b2a8c1a5d99505be79c03120e9d75f2f")); executeTest("test filter #2", spec); } @@ -72,7 +72,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilterWithSeparateNames() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("ee78c2e7128a8f9549233493c7cf6949")); + Arrays.asList("e350d9789bbdf334c1677506590d0798")); executeTest("test filter with separate names #2", spec); } @@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("285dd348c47c8c1e85d2886f9b33559e")); + Arrays.asList("c5ed9dd3975b3602293bb484b4fda5f4")); executeTest("test genotype filter #1", spec1); } @@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a9c835a13eb72aa22d5e271894d8ac33")); + Arrays.asList("979ccdf484259117aa31305701075602")); executeTest("test genotype filter #2", spec2); } @@ -96,7 +96,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testDeletions() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterExpression 'QUAL < 100' --filterName foo --variant:VCF " + privateTestDir + "twoDeletions.vcf", 1, - Arrays.asList("a1c02a5a90f1262e9eb3d2cad1fd08f2")); + 
Arrays.asList("8077eb3bab5ff98f12085eb04176fdc9")); executeTest("test deletions", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index b3c85622e..19d1e4cb3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -29,7 +29,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("cd112ec37a9e28d366aff29a85fdcaa0","313cc749c7ee97713e4551de39e01ac5") + Arrays.asList("cd112ec37a9e28d366aff29a85fdcaa0","f8721f4f5d3bae2848ae15c3f120709b") ); executeTest("testTrueNegativeMV", spec); } @@ -48,7 +48,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("27ccd6feb51de7e7dcdf35f4697fa4eb","dd90dad9fd11e1b16e6660c3ca0553e7") + Arrays.asList("27ccd6feb51de7e7dcdf35f4697fa4eb","547fdfef393f3045a96d245ef6af8acb") ); executeTest("testTruePositiveMV", spec); } @@ -67,7 +67,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("719d681bb0a52a40bc854bba107c5c94","b35a86d2cad17f0db7b5e84ddc0e5545") + Arrays.asList("719d681bb0a52a40bc854bba107c5c94","9529e2bf214d72e792d93fbea22a3b91") ); executeTest("testFalsePositiveMV", spec); } @@ -86,7 +86,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("7f4a277aee2c7398fcfa84d6c98d5fb3","c53b5fd377bef48e9c6035a94db398db") + Arrays.asList("7f4a277aee2c7398fcfa84d6c98d5fb3","8c157d79dd00063d2932f0d2b96f53d8") ); executeTest("testSpecialCases", spec); } @@ -108,7 +108,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - 
Arrays.asList("44e09d2f9e4d8a9488226d03a97fe999","6f596470740e1a57679bbb38c0126364") + Arrays.asList("44e09d2f9e4d8a9488226d03a97fe999","343e418850ae4a687ebef2acd55fcb07") ); executeTest("testPriorOption", spec); } @@ -128,7 +128,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("b35a86d2cad17f0db7b5e84ddc0e5545") + Arrays.asList("9529e2bf214d72e792d93fbea22a3b91") ); executeTest("testMVFileOption", spec); } @@ -149,7 +149,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-fatherAlleleFirst" ), 2, - Arrays.asList("60ced3d078792a150a03640b62926857","6d550784382aa910f78b533d889c91c0") + Arrays.asList("60ced3d078792a150a03640b62926857","52ffa82428e63ade22ea37b72ae58492") ); executeTest("testFatherAlleleFirst", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index bb4b7a1be..11f1a0628 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -26,7 +26,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:332341-382503", 1, - Arrays.asList("442c819569417c1b7d6be9f41ce05394")); + Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); } @@ -36,7 +36,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:1232503-1332503", 1, - Arrays.asList("2a51ee7d3c024f2410dcee40c5412993")); + 
Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); } @@ -46,7 +46,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + " -L chr20:332341-382503", 1, - Arrays.asList("85bc9b03e24159f746dbd0cb988f9ec8")); + Arrays.asList("f685803333123a102ce1851d984cbd10")); executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); } @@ -56,7 +56,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + " -L chr20:332341-382503", 1, - Arrays.asList("96bb413a83c777ebbe622438e4565e8f")); + Arrays.asList("aaa7c25d118383639f273128d241e140")); executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); } @@ -66,7 +66,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + " -L chr20:332341-482503", 1, - Arrays.asList("7d2402f055d243e2208db9ea47973e13")); + Arrays.asList("418e29400762972e77bae4f73e16befe")); executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); } @@ -76,7 +76,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:652810-681757", 1, - Arrays.asList("72682b3f27c33580d2d4515653ba6de7")); + Arrays.asList("4c8f6190ecc86766baba3aba08542991")); executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 9bf01ad71..e0cda07d7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -7,10 +7,6 @@ import org.testng.annotations.DataProvider; import java.util.*; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { - static HashMap clusterFiles = new HashMap(); - static HashMap tranchesFiles = new HashMap(); - static HashMap inputVCFFiles = new HashMap(); - private static class VRTest { String inVCF; String tranchesMD5; @@ -27,7 +23,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "0ddd1e0e483d2eaf56004615cea23ec7", // tranches "b9709e4180e56abc691b208bd3e8626c", // recal file - "c58ff4140e8914f0b656ed625c7f73b9"); // cut VCF + "4c73ff0c8c5ae0055bfacf33329a2406"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -54,6 +50,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); } @@ -65,17 +62,18 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 20:12,000,000-30,000,000" + " --no_cmdline_in_header" + " -input " + params.inVCF + - " -o %s" + + " -U LENIENT_VCF_PROCESSING -o %s" + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " 
-recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibration-"+params.inVCF, spec); } VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", "da4458d05f6396f5c4ab96f274e5ccdc", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "05e88052e0798f1c1e83f0a8938bce56"); // cut VCF + "b9936d2432d3c85b2d8b5b7aa17d0950"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createData2() { @@ -101,6 +99,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); } @@ -111,12 +110,13 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -T ApplyRecalibration" + " -L 20:12,000,000-30,000,000" + " -mode INDEL" + - " --no_cmdline_in_header" + + " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + " -input " + params.inVCF + " -o %s" + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 981f00071..bbee99ba6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -38,14 +38,14 @@ public class CombineVariantsIntegrationTest extends WalkerTest { // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // - // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE allowMissingVCFHeaders ARGUMENTS + // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE lenientVCFProcessing ARGUMENTS // // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // private static String baseTestString(String args) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s --allowMissingVCFHeaders -R " + b36KGReference + args; + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; } private void cvExecuteTest(final String name, final WalkerTestSpec spec) { @@ -142,17 +142,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest { cvExecuteTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void complexTestFull() { combineComplexSites("", "8b19b54516b59de40992f0c4b328258a"); } - @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "a38dd097adc37420fe36ef8be14cfded"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "a3957dac9a617f50ce2668607e3baef0"); } - @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "a3957dac9a617f50ce2668607e3baef0"); } + @Test public void complexTestFull() { combineComplexSites("", "151a4970367dd3e73ba3e7f3c2f874f6"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "c0625e092b878b3d3eb1703c48e216b7"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" 
-sites_only", "6978329d6a1033ac16f83b49072c679b"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "6978329d6a1033ac16f83b49072c679b"); } @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( "-T CombineVariants --no_cmdline_in_header -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, 1, - Arrays.asList("3d2a5a43db86e3f6217ed2a63251285b")); + Arrays.asList("aa926eae333208dc1f41fe69dc95d7a6")); cvExecuteTest("combineDBSNPDuplicateSites:", spec); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 31f704b85..21d49638f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -39,9 +39,9 @@ public class CombineVariantsUnitTest { "##fileformat=VCFv4.0\n"+ "##filedate=2010-06-21\n"+ "##reference=NCBI36\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ // string to integer + "##INFO=\n"+ // string to integer "##FILTER=\n"+ "##FORMAT=\n"+ "##FORMAT=\n"+ diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index 1711e6e3c..e14580ead 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -40,7 +40,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference + " 
--variant " + privateTestDir + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("a139480c004859452d4095fe4859b42e")); + Arrays.asList("7d5f91fcf419211ae9eca6c66dcec0e6")); executeTest("test b36 to hg19", spec); } @@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference + " --variant " + privateTestDir + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("91344768f1e98c979364ec0d5d3aa9d6")); + Arrays.asList("29dab3555e7f1ee6a60e267b00215a11")); executeTest("test b36 to hg19, unsorted samples", spec); } @@ -58,7 +58,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + hg18Reference + " --variant:vcf " + privateTestDir + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("e0b813ff873185ab51995a151f80ec98")); + Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d")); executeTest("test hg18 to hg19, unsorted", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index a23a22162..30cdbee36 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -18,7 +18,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("d88bdae45ae0e74e8d8fd196627e612c") ); @@ -34,7 +34,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, - Arrays.asList("337bb7fc23153cf67acc42a466834775") + Arrays.asList("3d98a024bf3aecbd282843e0af89d0e6") ); executeTest("testRepeatedLineSelection--" + testfile, spec); @@ -47,7 +47,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("54289033d35d32b8ebbb38c51fbb614c") ); @@ -64,7 +64,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, - Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); executeTest("testComplexSelection--" + testfile, spec); @@ -78,7 +78,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A 
-xl_sf " + samplesFile + " --variant " + testfile, 1, - Arrays.asList("bc0e00d0629b2bc6799e7e9db0dc775c") + Arrays.asList("1f5c72951a35667c4bdf1be153787e27") ); spec.disableShadowBCF(); @@ -93,7 +93,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") ); @@ -109,7 +109,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("a111642779b05de33ad04073d6022c21") + Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") ); executeTest("testVariantTypeSelection--" + testFile, spec); @@ -161,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("a0b7f77edc16df0992d2c1363136a17e") + Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") ); executeTest("testNoGTs--" + testFile, spec); @@ -176,7 +176,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), 1, - Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); executeTest("testParallelization (2 threads)--" + testfile, spec); @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec = new 
WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), 1, - Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); @@ -204,7 +204,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("9acd6effcc78bfb832bed5edfd6a1b5b") + Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } @@ -223,7 +223,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { final String testFile = privateTestDir + "missingHeaderLine.vcf"; final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header" - + (expectedException == null ? " -allowMissingVCFHeaders" : ""); + + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); WalkerTestSpec spec = expectedException != null ? 
new WalkerTestSpec(cmd, 1, expectedException) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index a5cd49971..2b917ae0c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --no_cmdline_in_header " + " -o %s", 1, - Arrays.asList("2cdcd9e140eb1b6da7e365e37dd7d859") + Arrays.asList("283f434b3efbebb8e10ed6347f97d104") ); executeTest("testSimpleVCFStreaming", spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index eb79228e7..b0870b346 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -89,7 +89,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("95898aad8c9f9515c0e668e2fb65a024"); + md5.add("21084d32ce7ac5df3cee1730bfaaf71c"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + diff --git a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java index fd53283b1..32fe7597d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java @@ -1,8 +1,6 @@ package 
org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.walkers.bqsr.BQSRKeyManager; -import org.broadinstitute.sting.gatk.walkers.bqsr.ContextCovariate; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -59,19 +57,4 @@ public class BitSetUtilsUnitTest { //for (String d : dna) // Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d); } - - @Test(enabled = true) - public void testNumberOfBitsToRepresent() { - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented - - for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number - long n1 = 1L << i; - long n2 = Math.abs(random.nextLong()) % n1; - long n3 = n1 | n2; - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(n3), (n3 == n1) ? 
i : i + 1); - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(n1), i); - } - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java index ef8a67d47..a0feef186 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java @@ -480,7 +480,8 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest { final byte typeDescriptor = decoder.readTypeDescriptor(); // read the int[] with the low-level version - final int[] decoded = decoder.decodeIntArray(typeDescriptor); + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final int[] decoded = decoder.decodeIntArray(typeDescriptor, size); if ( isMissing(ints) ) { // we expect that the result is null in this case diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java index 70460ae01..b8d6f2d1d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java @@ -33,13 +33,13 @@ public class VCFHeaderUnitTest extends BaseTest { @Test public void testVCF4ToVCF4() { VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "47d32e7901650ba69ed41322af638806"); + checkMD5ofHeaderFile(header, "f05a57053a0c6a5bac15dba566f7f7ff"); } @Test public void testVCF4ToVCF4_alternate() { VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "954e9dd756d5f952cfb401a4db6bd145"); + checkMD5ofHeaderFile(header, "b1d71cc94261053131f8d239d65a8c9f"); } /** @@ -112,7 +112,7 @@ public class VCFHeaderUnitTest extends BaseTest { 
"##reference=NCBI36\n"+ "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ @@ -132,7 +132,7 @@ public class VCFHeaderUnitTest extends BaseTest { "##reference=NCBI36\n"+ "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 422e890de..2a92b85e1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -12,7 +12,7 @@ public class VCFIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadingAndWritingWitHNoChanges() { - String md5ofInputVCF = "babf02baabcfa7f72a2c6f7da5fdc996"; + String md5ofInputVCF = "d991abe6c6a7a778a60a667717903be0"; String testVCF = privateTestDir + "vcf4.1.example.vcf"; String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; @@ -30,12 +30,11 @@ public class VCFIntegrationTest extends WalkerTest { // See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic public void testReadingAndWritingBreakpointAlleles() { String testVCF = privateTestDir + "breakpoint-example.vcf"; - //String testVCF = validationDataLocation + "multiallelic.vcf"; String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("355b029487c3b4c499140d71310ca37e")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("13329ba7360a8beb3afc02569e5a20c4")); executeTest("Test reading and writing breakpoint VCF", spec1); } @@ -51,11 +50,20 @@ public class VCFIntegrationTest extends 
WalkerTest { } @Test - public void testReadingAndWritingSamtoolsWExBCFExample() { + public void testWritingSamtoolsWExBCFExample() { String testVCF = privateTestDir + "ex2.vcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("9773d6a121cfcb18d090965bc520f120")); - executeTest("Test reading and writing samtools WEx vcf/BCF example", spec1); + executeTest("Test writing samtools WEx BCF example", spec1); + } + + @Test + public void testReadingSamtoolsWExBCFExample() { + String testVCF = privateTestDir + "ex2.bcf"; + String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); + executeTest("Test reading samtools WEx BCF example", spec1); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 28573c600..3a9183e9a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.utils.interval; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.Interval; +import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.commons.io.FileUtils; import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import 
org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -45,7 +49,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs = new ArrayList(); for (String interval: intervals) locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; + return Collections.unmodifiableList(locs); } @BeforeClass @@ -277,7 +281,10 @@ public class IntervalUtilsUnitTest extends BaseTest { listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, null); Assert.assertEquals(ret.size(), 100); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 0); @@ -296,7 +303,10 @@ public class IntervalUtilsUnitTest extends BaseTest { allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); Assert.assertEquals(ret.size(), 150); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 50); @@ -316,7 +326,10 @@ public class IntervalUtilsUnitTest extends BaseTest { } } - List ret = 
IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); Assert.assertEquals(ret.size(), 40); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 20); @@ -761,7 +774,13 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(privateTestDir + unmergedIntervals)); Assert.assertEquals(locs.size(), 2); - List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + List merged; + + merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + Assert.assertEquals(merged.size(), 1); + + // Test that null means the same as ALL + merged = IntervalUtils.mergeIntervalLocations(locs, null); Assert.assertEquals(merged.size(), 1); } @@ -993,6 +1012,74 @@ public class IntervalUtilsUnitTest extends BaseTest { // Attempting to use the legacy -L "interval1;interval2" syntax should produce an exception: IntervalBinding binding = new IntervalBinding("1;2"); - List intervals = binding.getIntervals(toolkit); + binding.getIntervals(toolkit); + } + + @DataProvider(name="invalidIntervalTestData") + public Object[][] invalidIntervalDataProvider() throws Exception { + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + File fastaFile = new File("public/testdata/exampleFASTA.fasta"); + GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); + + return new Object[][] { + new Object[] {argCollection, genomeLocParser, "chr1", 10000000, 20000000}, + new Object[] {argCollection, genomeLocParser, "chr2", 1, 2}, + new Object[] {argCollection, 
genomeLocParser, "chr1", -1, 50} + }; + } + + @Test(dataProvider="invalidIntervalTestData") + public void testInvalidPicardIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + SAMFileHeader picardFileHeader = new SAMFileHeader(); + picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + IntervalList picardIntervals = new IntervalList(picardFileHeader); + picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); + + File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); + picardIntervals.write(picardIntervalFile); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidGATKFileIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", + String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + } + + private File createTempFile( String tempFilePrefix, String tempFileExtension, String... 
lines ) throws Exception { + File tempFile = BaseTest.createTempFile(tempFilePrefix, tempFileExtension); + FileUtils.writeLines(tempFile, Arrays.asList(lines)); + return tempFile; + } + + @DataProvider(name = "sortAndMergeIntervals") + public Object[][] getSortAndMergeIntervals() { + return new Object[][] { + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1", "chr1:2", "chr1:3") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1-3") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1"), getLocs("chr1") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1"), getLocs("chr1") } + }; + } + + @Test(dataProvider = "sortAndMergeIntervals") + public void testSortAndMergeIntervals(IntervalMergingRule merge, List unsorted, List expected) { + List sorted = IntervalUtils.sortAndMergeIntervals(hg18GenomeLocParser, unsorted, merge).toList(); + Assert.assertEquals(sorted, expected); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index f70466d4f..982ac03bd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.*; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import 
org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -22,7 +21,7 @@ import java.util.*; public class BaseRecalibrationUnitTest { private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; - private LinkedHashMap> keysAndTablesMap; + private RecalibrationTables recalibrationTables; private ReadGroupCovariate rgCovariate; private QualityScoreCovariate qsCovariate; @@ -50,19 +49,14 @@ public class BaseRecalibrationUnitTest { List optionalCovariates = new ArrayList(); dataManager = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager(true, 4); - keysAndTablesMap = new LinkedHashMap>(); rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(rgKeyManager, new HashMap()); qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(qsKeyManager, new HashMap()); cxCovariate = new ContextCovariate(); cxCovariate.initialize(RAC); @@ -70,8 +64,6 @@ public class BaseRecalibrationUnitTest { cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(cvKeyManager, new HashMap()); final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; int covariateIndex = 0; @@ -82,10 +74,13 @@ public class BaseRecalibrationUnitTest { readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); - for (int i=0; i> mapEntry : keysAndTablesMap.entrySet()) { - final 
BQSRKeyManager keyManager = mapEntry.getKey(); - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - if (numOptionalCovariates == 0) { - final long masterKey = keyManager.createMasterKey(bitKeys, EventType.BASE_SUBSTITUTION, -1); - updateCovariateWithKeySet(mapEntry.getValue(), masterKey, newDatum); - } else { - for (int j = 0; j < numOptionalCovariates; j++) { - final long masterKey = keyManager.createMasterKey(bitKeys, EventType.BASE_SUBSTITUTION, j); - updateCovariateWithKeySet(mapEntry.getValue(), masterKey, newDatum); - } - } + + rgTable.put(newDatum, bitKeys[0], EventType.BASE_SUBSTITUTION.index); + qualTable.put(newDatum, bitKeys[0], bitKeys[1], EventType.BASE_SUBSTITUTION.index); + for (int j = 0; j < optionalCovariates.size(); j++) { + covTable.put(newDatum, bitKeys[0], bitKeys[1], j, bitKeys[2 + j], EventType.BASE_SUBSTITUTION.index); } } - dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); + + recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); + + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); List quantizedQuals = new ArrayList(); List qualCounts = new ArrayList(); @@ -121,16 +112,15 @@ public class BaseRecalibrationUnitTest { } QuantizationInfo quantizationInfo = new QuantizationInfo(quantizedQuals, qualCounts); quantizationInfo.noQuantization(); - baseRecalibration = new BaseRecalibration(quantizationInfo, keysAndTablesMap, requestedCovariates); + baseRecalibration = new BaseRecalibration(quantizationInfo, recalibrationTables, requestedCovariates); } @Test(enabled=false) public void testGoldStandardComparison() { - debugTables(); for (int i = 0; i < read.getReadLength(); i++) { - long [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); + int [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); Object [] objKey = buildObjectKey(bitKey); byte v2 = 
baseRecalibration.performSequentialQualityCalculation(bitKey, EventType.BASE_SUBSTITUTION); byte v1 = goldStandardSequentialCalculation(objKey); @@ -138,7 +128,7 @@ public class BaseRecalibrationUnitTest { } } - private Object[] buildObjectKey(long[] bitKey) { + private Object[] buildObjectKey(final int[] bitKey) { Object[] key = new Object[bitKey.length]; key[0] = rgCovariate.formatKey(bitKey[0]); key[1] = qsCovariate.formatKey(bitKey[1]); @@ -147,49 +137,6 @@ public class BaseRecalibrationUnitTest { return key; } - private void debugTables() { - System.out.println("\nV1 Table\n"); - System.out.println("ReadGroup Table:"); - NestedHashMap nestedTable = dataManager.getCollapsedTable(0); - printNestedHashMap(nestedTable.data, ""); - System.out.println("\nQualityScore Table:"); - nestedTable = dataManager.getCollapsedTable(1); - printNestedHashMap(nestedTable.data, ""); - System.out.println("\nCovariates Table:"); - nestedTable = dataManager.getCollapsedTable(2); - printNestedHashMap(nestedTable.data, ""); - nestedTable = dataManager.getCollapsedTable(3); - printNestedHashMap(nestedTable.data, ""); - - - int i = 0; - System.out.println("\nV2 Table\n"); - for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = mapEntry.getKey(); - Map table = mapEntry.getValue(); - switch(i++) { - case 0 : - System.out.println("ReadGroup Table:"); - break; - case 1 : - System.out.println("QualityScore Table:"); - break; - case 2 : - System.out.println("Covariates Table:"); - break; - } - for (Map.Entry entry : table.entrySet()) { - Long key = entry.getKey(); - RecalDatum datum = entry.getValue(); - List keySet = keyManager.keySetFrom(key); - System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum) + "," + datum.getEstimatedQReported()); - } - System.out.println(); - } - - - } - private static void printNestedHashMap(Map table, String output) { for (Object key : table.keySet()) { String ret; diff --git 
a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index abaf23132..69f42e1f9 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -100,10 +100,10 @@ public class GenotypeLikelihoodsUnitTest { for (int nAlleles=2; nAlleles<=5; nAlleles++) // simplest case: diploid - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); // some special cases: ploidy = 20, #alleles = 4 - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771); } @Test diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index c75e22041..528f3dd29 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -51,20 +51,28 @@ import java.util.*; public class VariantContextTestProvider { final protected static Logger logger = Logger.getLogger(VariantContextTestProvider.class); + final private static boolean ENABLE_GENOTYPE_TESTS = true; + final private static boolean ENABLE_A_AND_G_TESTS = true; final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; + final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = false; final private static boolean 
ENABLE_SOURCE_VCF_TESTS = true; final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; + final private static List TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); private static VCFHeader syntheticHeader; final static List TEST_DATAs = new ArrayList(); private static VariantContext ROOT; - private final static List testSourceVCFs = Arrays.asList( - new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"), - new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf") - ); + private final static List testSourceVCFs = new ArrayList(); + static { + testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "ex2.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf")); + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf")); + } public abstract static class VariantContextIOTest { public String toString() { @@ -148,7 +156,7 @@ public class VariantContextTestProvider { logger.warn("Reading records from " + file); for ( final VariantContext raw : x.getSecond() ) { if ( raw != null ) - fullyDecoded.add(raw.fullyDecode(x.getFirst())); + fullyDecoded.add(raw.fullyDecode(x.getFirst(), false)); } logger.warn("Done reading " + file); @@ -179,6 +187,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, 
VCFHeaderLineType.String); @@ -241,7 +250,7 @@ public class VariantContextTestProvider { add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000))); add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); add(builder().attribute("INT3", null)); - add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20))); + add(builder().attribute("INT20", TWENTY_INTS)); add(builder().attribute("FLOAT1", 1.0)); add(builder().attribute("FLOAT1", 100.0)); @@ -267,9 +276,13 @@ public class VariantContextTestProvider { add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("VAR.INFO.STRING", null)); - addGenotypesToTestData(); + if ( ENABLE_GENOTYPE_TESTS ) { + addGenotypesToTestData(); + addComplexGenotypesTest(); + } - addComplexGenotypesTest(); + if ( ENABLE_A_AND_G_TESTS ) + addGenotypesAndGTests(); } private static void addGenotypesToTestData() { @@ -314,7 +327,6 @@ public class VariantContextTestProvider { } } - private static void addGenotypes( final VariantContext site) { // test ref/ref final Allele ref = site.getReference(); @@ -442,6 +454,11 @@ public class VariantContextTestProvider { attr("g1", ref, "INT3", 1, 2, 3), attr("g2", ref, "INT3")); + addGenotypeTests(site, + attr("g1", ref, "INT20", TWENTY_INTS), + attr("g2", ref, "INT20", TWENTY_INTS)); + + if (ENABLE_VARARRAY_TESTS) { addGenotypeTests(site, attr("g1", ref, "INT.VAR", 1, 2, 3), @@ -515,6 +532,46 @@ public class VariantContextTestProvider { } } + private static void addGenotypesAndGTests() { +// for ( final int ploidy : Arrays.asList(2)) { + for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) { + final List> alleleCombinations = + Arrays.asList( + Arrays.asList("A"), + Arrays.asList("A", "C"), + Arrays.asList("A", "C", "G"), + Arrays.asList("A", "C", "G", "T")); + + for ( final List alleles : alleleCombinations ) { + final VariantContextBuilder vcb = builder().alleles(alleles); + 
final VariantContext site = vcb.make(); + final int nAlleles = site.getNAlleles(); + final Allele ref = site.getReference(); + + // base genotype is ref/.../ref up to ploidy + final List baseGenotype = new ArrayList(ploidy); + for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref); + final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy); + + // ada is 0, 1, ..., nAlleles - 1 + final List ada = new ArrayList(nAlleles); + for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i); + + // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy) + final int[] pl = new int[nPLs]; + for ( int i = 0; i < pl.length; i++ ) pl[i] = i; + + final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE"); + gb.alleles(baseGenotype); + gb.PL(pl); + gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada); + vcb.genotypes(gb.make()); + + add(vcb); + } + } + } + private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) { if ( value.length == 0 ) return GenotypeBuilder.create(name, Arrays.asList(ref, ref)); @@ -598,7 +655,7 @@ public class VariantContextTestProvider { public VariantContext next() { try { final VariantContext vc = codec.decode(pbs); - return vc == null ? null : vc.fullyDecode(header); + return vc == null ? 
null : vc.fullyDecode(header, false); } catch ( IOException e ) { throw new RuntimeException(e); } @@ -646,20 +703,23 @@ public class VariantContextTestProvider { * @param expected */ public static void assertEquals( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual); - Assert.assertEquals(actual.getChr(), expected.getChr()); - Assert.assertEquals(actual.getStart(), expected.getStart()); - Assert.assertEquals(actual.getEnd(), expected.getEnd()); - Assert.assertEquals(actual.getID(), expected.getID()); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); + Assert.assertNotNull(actual, "VariantContext expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.getFilters(), expected.getFilters()); + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "filters"); BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes()); + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); if ( expected.hasGenotypes() ) { - Assert.assertEquals(actual.getSampleNames(), expected.getSampleNames()); + final Set actualSampleSet = new HashSet(actual.getSampleNames()); + final Set expectedSampleSet = new HashSet(expected.getSampleNames()); + Assert.assertTrue(actualSampleSet.equals(expectedSampleSet), "sample names"); // note this is necessary due to testng bug for set comps + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample 
names"); final Set samples = expected.getSampleNames(); for ( final String sample : samples ) { assertEquals(actual.getGenotype(sample), expected.getGenotype(sample)); @@ -668,33 +728,33 @@ public class VariantContextTestProvider { } public static void assertEquals(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName()); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString()); - Assert.assertEquals(actual.getType(), expected.getType()); + Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters()); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered()); + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP()); - Assert.assertEquals(actual.getAD(), expected.getAD()); - Assert.assertEquals(actual.getGQ(), expected.getGQ()); - Assert.assertEquals(actual.hasPL(), expected.hasPL()); - Assert.assertEquals(actual.hasAD(), expected.hasAD()); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ()); - Assert.assertEquals(actual.hasDP(), expected.hasDP()); + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad"); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); 
+ Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods()); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString()); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods()); - Assert.assertEquals(actual.getPL(), expected.getPL()); + Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertEquals(actual.getPL(), expected.getPL(), "Genotype getPL"); - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased()); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); } private static void assertAttributesEquals(final Map actual, Map expected) { @@ -706,16 +766,16 @@ public class VariantContextTestProvider { final Object expectedValue = expected.get(act.getKey()); if ( expectedValue instanceof List ) { final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List); + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); final List actualList = (List)actualValue; - 
Assert.assertEquals(actualList.size(), expectedList.size()); + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributesEquals(actualList.get(i), expectedList.get(i)); + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); } else - assertAttributesEquals(actualValue, expectedValue); + assertAttributeEquals(act.getKey(), actualValue, expectedValue); } else { // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue); + Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); } expectedKeys.remove(act.getKey()); } @@ -724,7 +784,7 @@ public class VariantContextTestProvider { // and they must all be null for ( final String missingExpected : expectedKeys ) { final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value)); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); } } @@ -741,12 +801,12 @@ public class VariantContextTestProvider { return false; } - private static void assertAttributesEquals(final Object actual, final Object expected) { + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { if ( expected instanceof Double ) { // must be very tolerant because doubles are being rounded to 2 sig figs BaseTest.assertEqualsDoubleSmart(actual, (Double)expected, 1e-2); } else - Assert.assertEquals(actual, expected); + Assert.assertEquals(actual, expected, "Attribute " + key); } public static void addComplexGenotypesTest() { @@ -816,14 +876,14 @@ public class VariantContextTestProvider { } public static void assertEquals(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size()); + Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size(), "No VCF header 
lines"); // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? //Assert.assertEquals(actual.getMetaData(), expected.getMetaData()); final List actualLines = new ArrayList(actual.getMetaData()); final List expectedLines = new ArrayList(expected.getMetaData()); for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i)); + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 9ecffe939..1b791bf6c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -121,7 +121,7 @@ public class VariantContextWritersUnitTest extends BaseTest { final List fullyDecoded = new ArrayList(vcsAfterIO.size()); for ( final VariantContext withStrings : vcsAfterIO ) - fullyDecoded.add(withStrings.fullyDecode(header)); + fullyDecoded.add(withStrings.fullyDecode(header, false)); return fullyDecoded; } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala index 2caa4d2aa..078331602 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue.engine -import java.util.Date import java.text.SimpleDateFormat /** @@ -36,18 +35,21 @@ class JobRunInfo { val formatter = new SimpleDateFormat("yy-MM-dd H:mm:ss:SSS"); /** The start time with millisecond resolution of this job */ - var startTime: 
Date = _ + var startTime: java.util.Date = _ /** The done time with millisecond resolution of this job */ - var doneTime: Date = _ + var doneTime: java.util.Date = _ var exechosts: String = "localhost" - def getStartTime = startTime - def getDoneTime = doneTime - def getFormattedStartTime = formatTime(getStartTime) - def getFormattedDoneTime = formatTime(getDoneTime) + def getStartTime: String = getTime(startTime) + def getDoneTime: String = getTime(doneTime) + def getFormattedStartTime = formatTime(startTime) + def getFormattedDoneTime = formatTime(doneTime) + + /** Helper function that returns the time of the date */ + private def getTime(d: java.util.Date): String = if ( d != null ) d.getTime.toString else "null" /** Helper function that pretty prints the date */ - private def formatTime(d: Date) = if ( d != null ) formatter.format(d) else "null" + private def formatTime(d: java.util.Date): String = if ( d != null ) formatter.format(d) else "null" def getExecHosts = exechosts @@ -55,14 +57,14 @@ class JobRunInfo { * Was any information set for this jobInfo? JobInfo can be unset because * the job never ran or because it already completed. */ - def isFilledIn = startTime != null + def isFilledIn = startTime != null && doneTime != null /** * How long did the job run (in wall time)? 
Returns -1 if this jobInfo isn't filled in */ def getRuntimeInMs: Long = { if ( isFilledIn ) - getDoneTime.getTime - getStartTime.getTime + doneTime.getTime - startTime.getTime else -1 } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 239f83482..76cefe2a5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -35,7 +35,7 @@ import org.ggf.drmaa.Session class GridEngineJobRunner(session: Session, function: CommandLineFunction) extends DrmaaJobRunner(session, function) with Logging { // Grid Engine disallows certain characters from being in job names. // This replaces all illegal characters with underscores - protected override val jobNameFilter = """[\n\t\r/:@\\*?]""" + protected override val jobNameFilter = """[\n\t\r/:,@\\*?]""" protected override val minRunnerPriority = -1023 protected override val maxRunnerPriority = 0 diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index 2609c3607..97669030a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -38,11 +38,11 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount protected override def maxIntervals = { - GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size + GATKScatterFunction.getGATKIntervals(this.originalGATK).contigs.size } def 
run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 2f604a809..e619c0a02 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -26,13 +26,23 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils} +import org.broadinstitute.sting.utils.interval.{IntervalSetRule, IntervalMergingRule, IntervalUtils} import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} +import org.broadinstitute.sting.utils.GenomeLoc +import org.broadinstitute.sting.commandline._ +import org.broad.tribble.Feature + +case class GATKIntervals(reference: File, intervals: Seq[File], intervalsString: Seq[String], + intervalSetRule: IntervalSetRule, intervalMergingRule: IntervalMergingRule, intervalPadding: Option[Int], + excludeIntervals: Seq[File], excludeIntervalsString: Seq[String]) { + + def this(gatk: CommandLineGATK) = this( + gatk.reference_sequence, + gatk.intervals, gatk.intervalsString, + gatk.interval_set_rule, gatk.interval_merging, gatk.interval_padding, + gatk.excludeIntervals, gatk.excludeIntervalsString) -case class GATKIntervals(reference: File, intervals: Seq[String]) { private lazy val referenceDataSource = new ReferenceDataSource(reference) lazy val 
samFileHeader = { @@ -42,16 +52,46 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { } lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals) - Collections.sort(parsedLocs) - val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) - Collections.unmodifiableList(mergedLocs) + val includeIntervalBindings = this.intervals.map(GATKIntervals.createBinding(_, "intervals")) ++ + this.intervalsString.map(GATKIntervals.createBinding(_, "intervalsString")) + val excludeIntervalBindings = this.excludeIntervals.map(GATKIntervals.createBinding(_, "excludeIntervals")) ++ + this.excludeIntervalsString.map(GATKIntervals.createBinding(_, "excludeIntervalsString")) + + IntervalUtils.parseIntervalBindings( + referenceDataSource, + includeIntervalBindings, + intervalSetRule, intervalMergingRule, intervalPadding.getOrElse(0), + excludeIntervalBindings).toList } lazy val contigs = locs.map(_.getContig).distinct.toSeq } + +object GATKIntervals { + def copyIntervalArguments(src: CommandLineGATK, dst: CommandLineGATK) { + dst.reference_sequence = src.reference_sequence + dst.intervals = src.intervals + dst.intervalsString = src.intervalsString + dst.interval_set_rule = src.interval_set_rule + dst.interval_merging = src.interval_merging + dst.interval_padding = src.interval_padding + dst.excludeIntervals = src.excludeIntervals + dst.excludeIntervalsString = src.excludeIntervalsString + } + + private def createBinding(interval: File, argumentName: String): IntervalBinding[Feature] = { + val tags = interval match { + case taggedFile: TaggedFile => ParsingMethod.parseTags(argumentName, taggedFile.tag) + case file: File => new Tags + } + 
createBinding(interval.getAbsolutePath, argumentName, tags) + } + + private def createBinding(interval: String, argumentName: String): IntervalBinding[Feature] = { + createBinding(interval, argumentName, new Tags) + } + + private def createBinding(interval: String, argumentName: String, tags: Tags): IntervalBinding[Feature] = { + ArgumentTypeDescriptor.parseBinding(interval, classOf[Feature], classOf[IntervalBinding[Feature]], argumentName, tags, argumentName).asInstanceOf[IntervalBinding[Feature]] + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index 28c3f41e9..9e79e8f61 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -28,14 +28,17 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils import java.io.File import org.broadinstitute.sting.utils.io.IOUtils import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction} -import org.broadinstitute.sting.commandline.Output +import org.broadinstitute.sting.commandline._ trait GATKScatterFunction extends ScatterFunction { - /** The runtime field to set for specifying an interval file. */ + /* The runtime field to set for specifying intervals. */ private final val intervalsField = "intervals" - - /** The runtime field to set for specifying an interval string. 
*/ private final val intervalsStringField = "intervalsString" + private final val excludeIntervalsField = "excludeIntervals" + private final val excludeIntervalsStringField = "excludeIntervalsString" + private final val intervalsSetRuleField = "interval_set_rule" + private final val intervalMergingField = "interval_merging" + private final val intervalPaddingField = "interval_padding" @Output(doc="Scatter function outputs") var scatterOutputFiles: Seq[File] = Nil @@ -43,25 +46,14 @@ trait GATKScatterFunction extends ScatterFunction { /** The original GATK function. */ protected var originalGATK: CommandLineGATK = _ - /** The reference sequence for the GATK function. */ - protected var referenceSequence: File = _ - - /** The list of interval files ("/path/to/interval.list") or interval strings ("chr1", "chr2") to parse into smaller parts. */ - protected var intervals: Seq[String] = Nil - /** Whether the last scatter job should also include any unmapped reads. */ protected var includeUnmapped: Boolean = _ override def init() { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] - this.referenceSequence = this.originalGATK.reference_sequence - if (this.originalGATK.intervals.isEmpty && (this.originalGATK.intervalsString == null || this.originalGATK.intervalsString.isEmpty)) { - this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, Seq.empty[String]).contigs - } else { - this.intervals ++= this.originalGATK.intervals.map(_.toString) - this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) + // If intervals have been specified check if unmapped is included + if (this.originalGATK.intervals.size + this.originalGATK.intervalsString.size > 0) this.includeUnmapped = this.originalGATK.intervalsString.exists(interval => IntervalUtils.isUnmapped(interval)) - } } override def isScatterGatherable = { @@ -74,6 +66,12 @@ trait GATKScatterFunction extends ScatterFunction { 
cloneFunction.setFieldValue(this.intervalsStringField, Seq("unmapped")) else cloneFunction.setFieldValue(this.intervalsStringField, Seq.empty[String]) + + cloneFunction.setFieldValue(this.intervalsSetRuleField, null) + cloneFunction.setFieldValue(this.intervalMergingField, null) + cloneFunction.setFieldValue(this.intervalPaddingField, None) + cloneFunction.setFieldValue(this.excludeIntervalsField, Seq.empty[File]) + cloneFunction.setFieldValue(this.excludeIntervalsStringField, Seq.empty[String]) } override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) { @@ -85,29 +83,28 @@ trait GATKScatterFunction extends ScatterFunction { } /** - * Returns true if all interval files exist. + * @return true if all interval files exist. */ protected def intervalFilesExist = { - !this.intervals.exists(interval => IntervalUtils.isIntervalFile(interval, false) && !new File(interval).exists) + !(this.originalGATK.intervals ++ this.originalGATK.excludeIntervals).exists(interval => !interval.exists()) } /** - * Returns the maximum number of intervals or this.scatterCount if the maximum can't be determined ahead of time. * @return the maximum number of intervals or this.scatterCount if the maximum can't be determined ahead of time. 
*/ protected def maxIntervals: Int } object GATKScatterFunction { - var gatkIntervals = Seq.empty[GATKIntervals] + var gatkIntervalsCache = Seq.empty[GATKIntervals] - def getGATKIntervals(reference: File, intervals: Seq[String]) = { - gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match { - case Some(gi) => gi + def getGATKIntervals(originalFunction: CommandLineGATK) = { + val gatkIntervals = new GATKIntervals(originalFunction) + gatkIntervalsCache.find(_ == gatkIntervals) match { + case Some(existingGatkIntervals) => existingGatkIntervals case None => - val gi = new GATKIntervals(reference, intervals) - gatkIntervals :+= gi - gi + gatkIntervalsCache :+= gatkIntervals + gatkIntervals } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index 40a6fc4b4..03b142bca 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -33,12 +33,12 @@ import org.broadinstitute.sting.queue.function.InProcessFunction */ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = - GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size + GATKScatterFunction.getGATKIntervals(this.originalGATK).locs.size override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount def run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } diff 
--git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 8f52b9b82..150df4e38 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -31,13 +31,11 @@ import org.broadinstitute.sting.queue.function.InProcessFunction /** * A scatter function that divides down to the locus level. */ -//class LocusScatterFunction extends IntervalScatterFunction { } - class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = scatterCount def run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 11a66a37b..7862dec41 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -37,14 +37,11 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] - override def freezeFieldValues { + override def freezeFieldValues() { this.jarFile = this.originalGATK.jarFile - this.reference_sequence = this.originalGATK.reference_sequence - this.intervals = this.originalGATK.intervals - this.intervalsString = this.originalGATK.intervalsString - this.variant = 
this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput + GATKIntervals.copyIntervalArguments(this.originalGATK, this) // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK @@ -55,6 +52,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean] - super.freezeFieldValues + super.freezeFieldValues() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index 534d68069..13448afdd 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -49,6 +49,11 @@ trait JavaCommandLineFunction extends CommandLineFunction { */ var javaMemoryLimit: Option[Double] = None + /** + * Max number of GC threads + */ + var javaGCThreads: Option[Int] = None + override def freezeFieldValues() { super.freezeFieldValues() @@ -73,6 +78,8 @@ trait JavaCommandLineFunction extends CommandLineFunction { } def javaOpts = optional("-Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m", spaceSeparated=false) + + conditional(javaGCThreads.isDefined, "-XX:+UseParallelOldGC") + + optional("-XX:ParallelGCThreads=", javaGCThreads, spaceSeparated=false) + required("-Djava.io.tmpdir=", jobTempDir, spaceSeparated=false) def commandLine = required("java") + diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index 73ab7c366..c69a310b3 100644 --- 
a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -56,8 +56,8 @@ trait QJobReport extends Logging { "jobName" -> QJobReport.workAroundSameJobNames(this), "intermediate" -> self.isIntermediate, "exechosts" -> info.getExecHosts, - "startTime" -> info.getStartTime.getTime, - "doneTime" -> info.getDoneTime.getTime, + "startTime" -> info.getStartTime, + "doneTime" -> info.getDoneTime, "formattedStartTime" -> info.getFormattedStartTime, "formattedDoneTime" -> info.getFormattedDoneTime, "runtime" -> info.getRuntimeInMs).mapValues((x:Any) => if (x != null) x.toString else "null") diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 6b615e6d9..0d8edc25d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -70,7 +70,7 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @return The parsed object. 
*/ def parse(parsingEngine: ParsingEngine, source: ArgumentSource, typeType: Type, argumentMatches: ArgumentMatches) = { - parse(parsingEngine,source, makeRawTypeIfNecessary(typeType), argumentMatches) + parse(parsingEngine,source, ArgumentTypeDescriptor.makeRawTypeIfNecessary(typeType), argumentMatches) } def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b23350557..2c6016c9b 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -26,19 +26,21 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import org.testng.Assert -import org.testng.annotations.Test +import org.testng.annotations.{DataProvider, Test} import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import collection.JavaConversions._ import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.utils.exceptions.UserException class GATKIntervalsUnitTest { private final lazy val hg18Reference = new File(BaseTest.hg18Reference) private final lazy val hg18GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg18Reference)) private final lazy val hg18ReferenceLocs = GenomeLocSortedSet. 
createSetFromSequenceDictionary(new ReferenceDataSource(hg18Reference).getReference.getSequenceDictionary).toList + private final lazy val hg19GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg19Reference)) private final lazy val hg19Reference = new File(BaseTest.hg19Reference) @@ -48,14 +50,14 @@ class GATKIntervalsUnitTest { val chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-3") val chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:3-5") - val gi = new GATKIntervals(hg18Reference, Seq("chr1:1-1", "chr2:2-3", "chr3:3-5")) + val gi = createGATKIntervals(hg18Reference, Seq("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toSeq, Seq(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, Seq("chr1", "chr2", "chr3")) } @Test(timeOut = 30000L) def testIntervalFile() { - var gi = new GATKIntervals(hg19Reference, Seq(BaseTest.hg19Intervals)) + val gi = createGATKIntervals(hg19Reference, Seq(BaseTest.hg19Intervals)) Assert.assertEquals(gi.locs.size, 189894) // Timeout check is because of bad: // for(Item item: javaConvertedScalaList) @@ -67,28 +69,85 @@ class GATKIntervalsUnitTest { @Test def testEmptyIntervals() { - val gi = new GATKIntervals(hg18Reference, Nil) + val gi = createGATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) } @Test def testContigCounts() { - Assert.assertEquals(new GATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) - Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3")) - Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) + Assert.assertEquals(createGATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) + Assert.assertEquals(createGATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", 
"chr2", "chr3")) + Assert.assertEquals(createGATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) } - @Test - def testSortAndMergeIntervals() { - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30")) + @DataProvider(name="sortAndMergeIntervals") + def getSortAndMergeIntervals: Array[Array[AnyRef]] = { + Array( + Array(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")), + Array(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")), + Array(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-30")), + Array(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-30")), + Array(Seq("chr1:1-9", "chr1:21-30", "chr1:11-20"), Seq("chr1:1-9", "chr1:11-30")) + ).asInstanceOf[Array[Array[AnyRef]]] } - private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) { - Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + @Test(dataProvider="sortAndMergeIntervals") + def testSortAndMergeIntervals(unmerged: Seq[String], expected: Seq[String]) { + Assert.assertEquals(createGATKIntervals(hg18Reference, unmerged).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + } + + @DataProvider(name="taggedFiles") + def getTaggedFiles: Array[Array[AnyRef]] = { + Array( + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", null, Seq("chr1:1-10")), + Array(hg18Reference, BaseTest.privateTestDir + 
"small_unmerged_gatk_intervals.list", "", Seq("chr1:1-10")), + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "myList", Seq("chr1:1-10")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", null, Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVcf", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "VCF", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVcf,VCF", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", null, Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "BED", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed,BED", Seq("20:1-999", "20:1002-2000", "22:1001-6000")) + ) + } + + @Test(dataProvider="taggedFiles") + def testTaggedFiles(reference: File, file: String, tags: String, expected: Seq[String]) { + val gatk = new CommandLineGATK + gatk.reference_sequence = reference + gatk.intervals = Seq(new TaggedFile(file, tags)) + val parser = if (reference == hg18Reference) hg18GenomeLocParser else hg19GenomeLocParser + Assert.assertEquals(new GATKIntervals(gatk).locs.toSeq, expected.map(parser.parseGenomeLoc(_))) + } + + @DataProvider(name="badTaggedFiles") + def getBadTaggedFiles: Array[Array[AnyRef]] = { + Array( + Array(hg18Reference, 
BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "VCF"), + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "too,many,tags"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "BED"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "VCF,myVCF"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVCF,VCF,extra"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "VCF"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "BED,myBed"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed,BED,extra") + ).asInstanceOf[Array[Array[AnyRef]]] + } + + @Test(dataProvider = "badTaggedFiles", expectedExceptions = Array(classOf[UserException])) + def testBadTaggedFiles(reference: File, file: String, tags: String) { + testTaggedFiles(reference, file, tags, Nil) + } + + private def createGATKIntervals(reference: File, intervals: Seq[String]) = { + val gatk = new CommandLineGATK + gatk.reference_sequence = reference + gatk.intervalsString = intervals + new GATKIntervals(gatk) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index a3c5f5144..20458c7c4 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -39,7 +39,6 @@ class DataProcessingPipelineTest { " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", " -i " + BaseTest.publicTestDir + "exampleBAM.bam", " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", - " -nv ", " -test ", " -p " + projectName).mkString spec.fileMD5s += testOut -> "0de95b5642e41e11ecd6fa1770242b88" @@ -57,7 +56,6 @@ class DataProcessingPipelineTest { " -R " + 
BaseTest.publicTestDir + "exampleFASTA.fasta", " -i " + BaseTest.publicTestDir + "exampleBAM.bam", " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", - " -nv ", " -test ", " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index 6e37ae2a3..c9d8b59c9 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -42,4 +42,43 @@ class ExampleUnifiedGenotyperPipelineTest { spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } + + @Test + def testUnifiedGenotyperWithGatkIntervals() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_gatk_intervals" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.intervals").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testUnifiedGenotyperWithBedIntervals() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_bed_intervals" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.bed").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testUnifiedGenotyperWithVcfIntervals() { + val spec = new 
PipelineTestSpec + spec.name = "unifiedgenotyper_with_vcf_intervals" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.1.vcf").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } } diff --git a/settings/repository/org.broad/tribble-107.xml b/settings/repository/org.broad/tribble-107.xml deleted file mode 100644 index 0d3a50baa..000000000 --- a/settings/repository/org.broad/tribble-107.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-107.jar b/settings/repository/org.broad/tribble-110.jar similarity index 91% rename from settings/repository/org.broad/tribble-107.jar rename to settings/repository/org.broad/tribble-110.jar index 7157387ee..f8e312ad9 100644 Binary files a/settings/repository/org.broad/tribble-107.jar and b/settings/repository/org.broad/tribble-110.jar differ diff --git a/settings/repository/org.broad/tribble-110.xml b/settings/repository/org.broad/tribble-110.xml new file mode 100644 index 000000000..84a550b27 --- /dev/null +++ b/settings/repository/org.broad/tribble-110.xml @@ -0,0 +1,3 @@ + + +