Merge branch 'master' of ssh://gsa4.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2012-08-03 12:24:37 -04:00 · 2012-08-03 12:24:37 -04:00 · 6f8e7692d4
parent 9e25b209e0 8817fc70d1
commit 6f8e7692d4
34 changed files with 346 additions and 111 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -241,6 +241,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
        samplesList.addAll( samples );
        // initialize the UnifiedGenotyper Engine which is used to call into the exact model
+        UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument
        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
        UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
        UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
--- a/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R
+++ b/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R
@ -1,4 +1,5 @@
 library("ggplot2")
+library("tools") #For compactPDF in R 2.13+

 args <- commandArgs(TRUE)
 data <- read.csv(args[1])
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@ -797,6 +797,14 @@ public class GenomeAnalysisEngine {
        if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
            throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");

+        if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
+            throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
+
+        boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class);
+
+        if (argCollection.keepProgramRecords)
+            removeProgramRecords = false;
+
        return new SAMDataSource(
                samReaderIDs,
                threadAllocation,
@ -813,7 +821,8 @@ public class GenomeAnalysisEngine {
                getWalkerBAQQualityMode(),
                refReader,
                getBaseRecalibration(),
-                argCollection.defaultBaseQualities);
+                argCollection.defaultBaseQualities,
+                removeProgramRecords);
    }

    /**
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@ -249,6 +249,12 @@ public class GATKArgumentCollection {
    @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
    public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;

+    @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false)
+    public boolean removeProgramRecords = false;
+
+    @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false)
+    public boolean keepProgramRecords = false;
+
    @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime.  For expert users only who know what they are doing.  We do not support usage of this argument.", required = false)
    public ValidationExclusion.TYPE unsafe;

--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@ -89,6 +89,11 @@ public class SAMDataSource {
     */
    private final SAMFileReader.ValidationStringency validationStringency;

+    /**
+     * Do we want to remove the program records from this data source?
+     */
+    private final boolean removeProgramRecords;
+
    /**
     * Store BAM indices for each reader present.
     */
@ -200,7 +205,8 @@ public class SAMDataSource {
                BAQ.QualityMode.DONT_MODIFY,
                null, // no BAQ
                null, // no BQSR
-                (byte) -1);
+                (byte) -1,
+                false);
    }

    /**
@ -233,7 +239,8 @@ public class SAMDataSource {
            BAQ.QualityMode qmode,
            IndexedFastaSequenceFile refReader,
            BaseRecalibration bqsrApplier,
-            byte defaultBaseQualities) {
+            byte defaultBaseQualities,
+            boolean removeProgramRecords) {
        this.readMetrics = new ReadMetrics();
        this.genomeLocParser = genomeLocParser;

@ -249,6 +256,7 @@ public class SAMDataSource {
            dispatcher = null;

        validationStringency = strictness;
+        this.removeProgramRecords = removeProgramRecords;
        if(readBufferSize != null)
            ReadShard.setReadBufferSize(readBufferSize);
        else {
@ -748,7 +756,7 @@ public class SAMDataSource {
        private synchronized void createNewResource() {
            if(allResources.size() > maxEntries)
                throw new ReviewedStingException("Cannot create a new resource pool.  All resources are in use.");
-            SAMReaders readers = new SAMReaders(readerIDs, validationStringency);
+            SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords);
            allResources.add(readers);
            availableResources.add(readers);
        }
@ -777,9 +785,11 @@ public class SAMDataSource {
        /**
         * Derive a new set of readers from the Reads metadata.
         * @param readerIDs reads to load.
+         * TODO: validationStringency is not used here
         * @param validationStringency validation stringency.
+         * @param removeProgramRecords indicate whether to clear program records from the readers
         */
-        public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
+        public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency, boolean removeProgramRecords) {
            final int totalNumberOfFiles = readerIDs.size();
            int readerNumber = 1;
            final SimpleTimer timer = new SimpleTimer().start();
@ -790,6 +800,9 @@ public class SAMDataSource {
            long lastTick = timer.currentTime();
            for(final SAMReaderID readerID: readerIDs) {
                final ReaderInitializer init = new ReaderInitializer(readerID).call();
+                if (removeProgramRecords) {
+                    init.reader.getFileHeader().setProgramRecords(new ArrayList<SAMProgramRecord>());
+                }
                if (threadAllocation.getNumIOThreads() > 0) {
                    inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
                }
--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java
@ -89,9 +89,9 @@ public class GATKReport {
            reader = new BufferedReader(new FileReader(file));
            reportHeader = reader.readLine();
        } catch (FileNotFoundException e) {
-            throw new ReviewedStingException("Could not open file : " + file);
+            throw new UserException.CouldNotReadInputFile(file, "it does not exist");
        } catch (IOException e) { 
-            throw new ReviewedStingException("Could not read file : " + file);                            
+            throw new UserException.CouldNotReadInputFile(file, e);
        }   


--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
@ -32,6 +32,7 @@ import java.util.List;
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50,maxRegion=1500)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
+@RemoveProgramRecords
 public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {

    @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java
@ -19,6 +19,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.LOCUS)
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
+@RemoveProgramRecords
 public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
    // Do we actually want to operate on the context?
    public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java
@ -0,0 +1,21 @@
+package org.broadinstitute.sting.gatk.walkers;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: thibault
+ * Date: 8/2/12
+ * Time: 1:58 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+import java.lang.annotation.*;
+
+/**
+ * Indicates that program records should be removed from SAM headers by default for this walker
+ */
+@Documented
+@Inherited
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+public @interface RemoveProgramRecords {
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
@ -62,7 +62,7 @@ import java.util.ArrayList;
 * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
 * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
 * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
- * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
+ * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
 * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
 * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
 * <p>
@ -94,7 +94,7 @@ import java.util.ArrayList;
 * <h2>Examples</h2>
 * <pre>
 * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T BaseQualityScoreRecalibrator \
+ *   -T BaseRecalibrator \
 *   -I my_reads.bam \
 *   -R resources/Homo_sapiens_assembly18.fasta \
 *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
@ -288,7 +288,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
    private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
 //        for ( String sample : samplesToPhase )
 //            logger.debug(String.format("  Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
-        VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true);
+        VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
 //        logger.debug("original VC = " + vc);
 //        logger.debug("sub      VC = " + subvc);
        return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java
@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector {
            return true;
        // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
        // first subset to the samples
-        VariantContext subContext = vc.subContextFromSamples(samples, true);
+        VariantContext subContext = vc.subContextFromSamples(samples);

        // now check to see (using EXACT model) whether this should be variant
        // do we want to apply a prior? maybe user-spec?
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java
@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{
        if ( samples == null || samples.isEmpty() )
            return true;

-        VariantContext subContext =  vc.subContextFromSamples(samples, false);
+        VariantContext subContext =  vc.subContextFromSamples(samples);
        if ( subContext.isPolymorphicInSamples() ) {
            return true;
        }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
@ -500,7 +500,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu

    @Requires({"eval != null", "comp != null"})
    private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
-        // find all of the matching comps
+        if ( comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION )
+            // if either of these are NO_VARIATION they are LENIENT matches
+            return EvalCompMatchType.LENIENT;
+
        if ( comp.getType() != eval.getType() )
            return EvalCompMatchType.NO_MATCH;

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java
@ -57,9 +57,12 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
        }
    }

-    public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
-        if (vc1 != null) updateTiTv(vc1, false);
-        if (vc2 != null) updateTiTv(vc2, true);
+    @Override
+    public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if (eval != null)
+            updateTiTv(eval, false);
+        if (comp != null)
+            updateTiTv(comp, true);
    }

    @Override
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java
@ -28,7 +28,7 @@ public class Novelty extends VariantStratifier implements StandardStratification
            final Collection<VariantContext> knownComps = tracker.getValues(knowns, ref.getLocus());
            for ( final VariantContext c : knownComps ) {
                // loop over sites, looking for something that matches the type eval
-                if ( eval.getType() == c.getType() ) {
+                if ( eval.getType() == c.getType() || eval.getType() == VariantContext.Type.NO_VARIATION ) {
                    return KNOWN_STATES;
                }
            }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java
@ -197,7 +197,9 @@ public class VariantEvalUtils {
     * @return a new VariantContext with just the requested samples
     */
    public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
-        return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false));
+        // if we want to preserve AC0 sites as polymorphic we need to not rederive alleles
+        final boolean deriveAlleles = variantEvalWalker.ignoreAC0Sites();
+        return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, deriveAlleles));
    }

    public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) {
@ -262,12 +264,8 @@ public class VariantEvalUtils {
                // First, filter the VariantContext to represent only the samples for evaluation
                VariantContext vcsub = vc;

-                if (subsetBySample && vc.hasGenotypes()) {
-                    if ( variantEvalWalker.isSubsettingToSpecificSamples() )
-                        vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
-                    else
-                        vcsub = ensureAnnotations(vc, vc);
-                }
+                if (subsetBySample && vc.hasGenotypes())
+                    vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());

                if ((byFilter || !vcsub.isFiltered())) {
                    addMapping(mapping, VariantEval.getAllSampleName(), vcsub);
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
@ -58,14 +58,11 @@ import java.util.*;
 * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
 * slightly lower quality level.
 *
- * <p>
- * See <a href="http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration">the GATK wiki for a tutorial and example recalibration accuracy plots.</a>
- *
 * <h2>Input</h2>
 * <p>
 * The input raw variants to be recalibrated.
 * <p>
- * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
+ * The recalibration table file in VCF format that was generated by the VariantRecalibrator walker.
 * <p>
 * The tranches file that was generated by the VariantRecalibrator walker.
 *
@ -82,6 +79,7 @@ import java.util.*;
 *   --ts_filter_level 99.0 \
 *   -tranchesFile path/to/output.tranches \
 *   -recalFile path/to/output.recal \
+ *   -mode SNP \
 *   -o path/to/output.recalibrated.filtered.vcf
 * </pre>
 *
@ -198,7 +196,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T

        for( final VariantContext vc : VCs ) {

-            if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
+            if( VariantDataManager.checkVariationClass( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {

                final VariantContext recalDatum = getMatchingRecalVC(vc, recals);
                if( recalDatum == null ) {
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
 import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
 import org.broadinstitute.sting.utils.exceptions.UserException;
@ -273,11 +274,37 @@ public class VariantDataManager {
    }

    private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
-        return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
-                        ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
+        return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) &&
                        (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples());
    }

+    protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) {
+        switch( trainVC.getType() ) {
+            case SNP:
+            case MNP:
+                return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.SNP );
+            case INDEL:
+            case MIXED:
+            case SYMBOLIC:
+                return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL );
+            default:
+                return false;
+        }
+    }
+
+    protected static boolean checkVariationClass( final VariantContext evalVC, final VariantRecalibratorArgumentCollection.Mode mode ) {
+        switch( mode ) {
+            case SNP:
+                return evalVC.isSNP() || evalVC.isMNP();
+            case INDEL:
+                return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic();
+            case BOTH:
+                return true;
+            default:
+                throw new ReviewedStingException( "Encountered unknown recal mode: " + mode );
+        }
+    }
+
    public void writeOutRecalibrationTable( final VariantContextWriter recalWriter ) {
        // we need to sort in coordinate order in order to produce a valid VCF
        Collections.sort( data, new Comparator<VariantDatum>() {
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.QualityUtils;
 import org.broadinstitute.sting.utils.R.RScriptExecutor;
 import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
 import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
@ -48,7 +47,6 @@ import org.broadinstitute.sting.utils.io.Resource;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
 import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
-import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;

 import java.io.File;
 import java.io.FileNotFoundException;
@ -65,7 +63,7 @@ import java.util.*;
 * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
 * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
 * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
- * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic
+ * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the the probability that a SNP is a true genetic
 * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
 * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
 * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
@ -73,15 +71,9 @@ import java.util.*;
 * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
 *
 * <p>
- * NOTE: Please see our <a href="http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3">best practices wiki page</a> for our recommendations on which annotations to use for specific project designs.
- *
- * <p>
 * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
 * See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
 *
- * <p>
- * See <a href="http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration">the GATK wiki for a tutorial and example recalibration accuracy plots.</a>
- *
 * <h2>Input</h2>
 * <p>
 * The input raw variants to be recalibrated.
@ -90,7 +82,7 @@ import java.util.*;
 *
 * <h2>Output</h2>
 * <p>
- * A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
+ * A recalibration table file in VCF format that is used by the ApplyRecalibration walker.
 * <p>
 * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
 *
@ -102,8 +94,9 @@ import java.util.*;
 *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
 *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
 *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
- *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an MQ \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
+ *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
+ *   -mode SNP \
 *   -recalFile path/to/output.recal \
 *   -tranchesFile path/to/output.tranches \
 *   -rscriptFile path/to/output.plots.R
@ -187,9 +180,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    @Advanced
    @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
    protected Boolean TRUST_ALL_POLYMORPHIC = false;
-    //@Hidden
-    //@Argument(fullName = "projectConsensus", shortName = "projectConsensus", doc = "Perform 1000G project consensus. This implies an extra prior factor based on the individual participant callsets passed in with consensus=true rod binding tags.", required = false)
-    //protected Boolean PERFORM_PROJECT_CONSENSUS = false;

    /////////////////////////////
    // Private Member Variables
@ -255,7 +245,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat

        for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
            if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) {
-                if( checkRecalibrationMode( vc, VRAC.MODE ) ) {
+                if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) {
                    final VariantDatum datum = new VariantDatum();

                    // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
@ -268,10 +258,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
                    // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
                    dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
                    double priorFactor = QualityUtils.qualToProb( datum.prior );
-                    //if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
-                    //    final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
-                    //    priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
-                    //}
                    datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );

                    mapList.add( datum );
@ -282,12 +268,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        return mapList;
    }

-    public static boolean checkRecalibrationMode( final VariantContext vc, final VariantRecalibratorArgumentCollection.Mode mode ) {
-        return mode == VariantRecalibratorArgumentCollection.Mode.BOTH ||
-                (mode == VariantRecalibratorArgumentCollection.Mode.SNP && vc.isSNP()) ||
-	            (mode == VariantRecalibratorArgumentCollection.Mode.INDEL && (vc.isIndel() || vc.isMixed()));
-    }
-
    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
@ -53,6 +53,11 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
    final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
    private final static boolean FORBID_SYMBOLICS = false;

+    private final static int ALLOWED_MAJOR_VERSION = 2;
+    private final static int MIN_MINOR_VERSION = 1;
+
+    private BCFVersion bcfVersion = null;
+
    private VCFHeader header = null;

    /**
@ -131,8 +136,16 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
    public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
        try {
            // note that this reads the magic as well, and so does double duty
-            if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
-                error("Input stream does not begin with BCF2 magic");
+            bcfVersion = BCFVersion.readBCFVersion(inputStream);
+            if ( bcfVersion == null )
+                error("Input stream does not contain a BCF encoded file; BCF magic header info not found");
+
+            if ( bcfVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION )
+                error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion());
+            if ( bcfVersion.getMinorVersion() < MIN_MINOR_VERSION )
+                error("BCF2Codec can only process BCF2 files with minor version >= " + MIN_MINOR_VERSION + " but this file has minor version " + bcfVersion.getMinorVersion());
+
+            logger.info("BCF version " + bcfVersion);

            final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);

@ -187,7 +200,8 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(path);
-            return BCF2Utils.startsWithBCF2Magic(fis);
+            final BCFVersion version = BCFVersion.readBCFVersion(fis);
+            return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION;
        } catch ( FileNotFoundException e ) {
            return false;
        } catch ( IOException e ) {
@ -196,7 +210,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
            try {
                if ( fis != null ) fis.close();
            } catch ( IOException e ) {
-                ; // do nothing
+                // do nothing
            }
        }
    }
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java
@ -202,7 +202,7 @@ public final class BCF2Decoder {
                return null;
            else {
                final String s = new String(bytes, 0, goodLength);
-                return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
+                return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s;
            }
        } catch ( IOException e ) {
            throw new ReviewedStingException("readByte failure", e);
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java
@ -41,8 +41,6 @@ import java.util.*;
 * @since 5/12
 */
 public final class BCF2Utils {
-    public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
-
    public static final int MAX_ALLELES_IN_GENOTYPES = 127;

    public static final int OVERFLOW_ELEMENT_MARKER = 15;
@ -75,68 +73,60 @@ public final class BCF2Utils {
     */
    @Requires("header != null")
    @Ensures({"result != null", "new HashSet(result).size() == result.size()"})
-    public final static ArrayList<String> makeDictionary(final VCFHeader header) {
+    public static ArrayList<String> makeDictionary(final VCFHeader header) {
        final Set<String> seen = new HashSet<String>();
        final ArrayList<String> dict = new ArrayList<String>();

-        boolean sawPASS = false;
+        // special case the special PASS field which doesn't show up in the FILTER field definitions
+        seen.add(VCFConstants.PASSES_FILTERS_v4);
+        dict.add(VCFConstants.PASSES_FILTERS_v4);
+
        // set up the strings dictionary
        for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
            if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) {
                final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
                if ( ! seen.contains(idLine.getID())) {
-                    sawPASS = sawPASS || idLine.getID().equals(VCFConstants.PASSES_FILTERS_v4);
                    dict.add(idLine.getID());
                    seen.add(idLine.getID());
                }
            }
        }

-
-        if ( ! sawPASS )
-            dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field
-
        return dict;
    }

    @Requires({"nElements >= 0", "type != null"})
-    public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
+    public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
        int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
        byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
        return typeByte;
    }

    @Ensures("result >= 0")
-    public final static int decodeSize(final byte typeDescriptor) {
+    public static int decodeSize(final byte typeDescriptor) {
        return (0xF0 & typeDescriptor) >> 4;
    }

    @Ensures("result >= 0")
-    public final static int decodeTypeID(final byte typeDescriptor) {
+    public static int decodeTypeID(final byte typeDescriptor) {
        return typeDescriptor & 0x0F;
    }

    @Ensures("result != null")
-    public final static BCF2Type decodeType(final byte typeDescriptor) {
+    public static BCF2Type decodeType(final byte typeDescriptor) {
        return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
    }

-    public final static boolean sizeIsOverflow(final byte typeDescriptor) {
+    public static boolean sizeIsOverflow(final byte typeDescriptor) {
        return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
    }

    @Requires("nElements >= 0")
-    public final static boolean willOverflow(final long nElements) {
+    public static boolean willOverflow(final long nElements) {
        return nElements > MAX_INLINE_ELEMENTS;
    }

-    public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException {
-        final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length];
-        stream.read(magicBytes);
-        return Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE);
-    }
-
-    public final static byte readByte(final InputStream stream) {
+    public static byte readByte(final InputStream stream) {
        // TODO -- shouldn't be capturing error here
        try {
            return (byte)(stream.read() & 0xFF);
@ -155,7 +145,7 @@ public final class BCF2Utils {
     */
    @Requires({"strings != null", "strings.size() > 1"})
    @Ensures("result != null")
-    public static final String collapseStringList(final List<String> strings) {
+    public static String collapseStringList(final List<String> strings) {
        final StringBuilder b = new StringBuilder();
        for ( final String s : strings ) {
            if ( s != null ) {
@ -177,14 +167,14 @@ public final class BCF2Utils {
     */
    @Requires({"collapsed != null", "isCollapsedString(collapsed)"})
    @Ensures("result != null")
-    public static final List<String> exploreStringList(final String collapsed) {
+    public static List<String> explodeStringList(final String collapsed) {
        assert isCollapsedString(collapsed);
        final String[] exploded = collapsed.substring(1).split(",");
        return Arrays.asList(exploded);
    }

    @Requires("s != null")
-    public static final boolean isCollapsedString(final String s) {
+    public static boolean isCollapsedString(final String s) {
        return s.charAt(0) == ',';
    }

@ -226,7 +216,7 @@ public final class BCF2Utils {
    }

    @Ensures("result.isIntegerType()")
-    public final static BCF2Type determineIntegerType(final int value) {
+    public static BCF2Type determineIntegerType(final int value) {
        for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
            if ( potentialType.withinRange(value) )
                return potentialType;
@ -236,7 +226,7 @@ public final class BCF2Utils {
    }

    @Ensures("result.isIntegerType()")
-    public final static BCF2Type determineIntegerType(final int[] values) {
+    public static BCF2Type determineIntegerType(final int[] values) {
        // literally a copy of the code below, but there's no general way to unify lists and arrays in java
        BCF2Type maxType = BCF2Type.INT8;
        for ( final int value : values ) {
@ -262,7 +252,7 @@ public final class BCF2Utils {
     */
    @Requires({"t1.isIntegerType()","t2.isIntegerType()"})
    @Ensures("result.isIntegerType()")
-    public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
+    public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
        switch ( t1 ) {
            case INT8: return t2;
            case INT16: return t2 == BCF2Type.INT32 ? t2 : t1;
@ -272,7 +262,7 @@ public final class BCF2Utils {
    }

    @Ensures("result.isIntegerType()")
-    public final static BCF2Type determineIntegerType(final List<Integer> values) {
+    public static BCF2Type determineIntegerType(final List<Integer> values) {
        BCF2Type maxType = BCF2Type.INT8;
        for ( final int value : values ) {
            final BCF2Type type1 = determineIntegerType(value);
@ -297,7 +287,7 @@ public final class BCF2Utils {
     * @param o
     * @return
     */
-    public final static List<Object> toList(final Object o) {
+    public static List<Object> toList(final Object o) {
        if ( o == null ) return Collections.emptyList();
        else if ( o instanceof List ) return (List<Object>)o;
        else return Collections.singletonList(o);
@ -305,7 +295,7 @@ public final class BCF2Utils {


    @Requires({"stream != null", "bytesForEachInt > 0"})
-    public final static int readInt(int bytesForEachInt, final InputStream stream) {
+    public static int readInt(int bytesForEachInt, final InputStream stream) {
        switch ( bytesForEachInt ) {
            case 1: {
                return (byte)(readByte(stream));
@ -323,7 +313,7 @@ public final class BCF2Utils {
        }
    }

-    public final static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
+    public static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
        switch ( type.getSizeInBytes() ) {
            case 1:
                encodeStream.write(0xFF & value);
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java
@ -0,0 +1,80 @@
+package org.broadinstitute.sting.utils.codecs.bcf2;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Arrays;
+
+/**
+ * Simple holder for BCF version information
+ *
+ * User: depristo
+ * Date: 8/2/12
+ * Time: 2:16 PM
+ */
+public class BCFVersion {
+    /**
+     * BCF2 begins with the MAGIC info BCF_M_m where M is the major version (currently 2)
+     * and m is the minor version, currently 1
+     */
+    public static final byte[] MAGIC_HEADER_START = "BCF".getBytes();
+
+    final int majorVersion;
+    final int minorVersion;
+
+    public BCFVersion(int majorVersion, int minorVersion) {
+        this.majorVersion = majorVersion;
+        this.minorVersion = minorVersion;
+    }
+
+    /**
+     * @return the major version number of this BCF file
+     */
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    /**
+     * @return the minor version number of this BCF file
+     */
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    /**
+     * Return a new BCFVersion object describing the major and minor version of the BCF file in stream
+     *
+     * Note that stream must be at the very start of the file.
+     *
+     * @param stream
+     * @return a BCFVersion object, or null if stream doesn't contain a BCF file
+     * @throws IOException
+     */
+    public static BCFVersion readBCFVersion(final InputStream stream) throws IOException {
+        final byte[] magicBytes = new byte[MAGIC_HEADER_START.length];
+        stream.read(magicBytes);
+        if ( Arrays.equals(magicBytes, MAGIC_HEADER_START) ) {
+            // we're a BCF file
+            final int majorByte = stream.read();
+            final int minorByte = stream.read();
+            return new BCFVersion( majorByte, minorByte );
+        } else
+            return null;
+    }
+
+    /**
+     * Write out the BCF magic information indicating this is a BCF file with corresponding major and minor versions
+     * @param out
+     * @throws IOException
+     */
+    public void write(final OutputStream out) throws IOException {
+        out.write(MAGIC_HEADER_START);
+        out.write(getMajorVersion() & 0xFF);
+        out.write(getMinorVersion() & 0xFF);
+    }
+
+    @Override
+    public String toString() {
+        return String.format("BCF%d.%d", getMajorVersion(), getMinorVersion());
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@ -334,12 +334,14 @@ public class VariantContext implements Feature { // to enable tribble integratio
     * in this VC is returned as the set of alleles in the subContext, even if
     * some of those alleles aren't in the samples
     *
+     * WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING?
+     *
     * @param sampleNames    the sample names
-     * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples
+     * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default
     * @return new VariantContext subsetting to just the given samples
     */
    public VariantContext subContextFromSamples(Set<String> sampleNames, final boolean rederiveAllelesFromGenotypes ) {
-        if ( sampleNames.containsAll(getSampleNames()) ) {
+        if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) {
            return this; // fast path when you don't have any work to do
        } else {
            VariantContextBuilder builder = new VariantContextBuilder(this);
@ -355,8 +357,18 @@ public class VariantContext implements Feature { // to enable tribble integratio
        }
    }

+    /**
+     * @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true
+     *
+     * @param sampleNames
+     * @return
+     */
+    public VariantContext subContextFromSamples(final Set<String> sampleNames) {
+        return subContextFromSamples(sampleNames, true);
+    }
+
    public VariantContext subContextFromSample(String sampleName) {
-        return subContextFromSamples(Collections.singleton(sampleName), true);
+        return subContextFromSamples(Collections.singleton(sampleName));
    }

    /**
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
@ -31,6 +31,7 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
 import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
 import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
+import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
@ -83,6 +84,9 @@ import java.util.*;
 * @since 06/12
 */
 class BCF2Writer extends IndexingVariantContextWriter {
+    public static final int MAJOR_VERSION = 2;
+    public static final int MINOR_VERSION = 1;
+
    /**
     * If true, we will write out the undecoded raw bytes for a genotypes block, if it
     * is found in the input VC.  This can be very dangerous as the genotype encoding
@ -153,7 +157,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
            writer.close();

            final byte[] headerBytes = capture.toByteArray();
-            outputStream.write(BCF2Utils.MAGIC_HEADER_LINE);
+            new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream);
            BCF2Utils.encodeRawBytes(headerBytes.length, BCF2Type.INT32, outputStream);
            outputStream.write(headerBytes);
        } catch (IOException e) {
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java
@ -24,9 +24,12 @@

 package org.broadinstitute.sting.gatk.datasources.reads;

+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
 import static org.testng.Assert.fail;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
 import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMProgramRecord;
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.commandline.Tags;
@ -36,6 +39,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
 import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.testng.annotations.AfterMethod;
@ -143,4 +147,73 @@ public class SAMDataSourceUnitTest extends BaseTest {
            fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception");
        }
    }
+
+    /** Test that we clear program records when requested */
+    @Test
+    public void testRemoveProgramRecords() {
+        logger.warn("Executing testRemoveProgramRecords");
+
+        // setup the data
+        readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags()));
+
+        // use defaults
+        SAMDataSource data = new SAMDataSource(readers,
+                new ThreadAllocation(),
+                null,
+                genomeLocParser,
+                false,
+                SAMFileReader.ValidationStringency.SILENT,
+                null,
+                null,
+                new ValidationExclusion(),
+                new ArrayList<ReadFilter>(),
+                false);
+
+        List<SAMProgramRecord> defaultProgramRecords = data.getHeader().getProgramRecords();
+        assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor");
+
+        boolean removeProgramRecords = false;
+        data = new SAMDataSource(readers,
+                new ThreadAllocation(),
+                null,
+                genomeLocParser,
+                false,
+                SAMFileReader.ValidationStringency.SILENT,
+                null,
+                null,
+                new ValidationExclusion(),
+                new ArrayList<ReadFilter>(),
+                false,
+                BAQ.CalculationMode.OFF,
+                BAQ.QualityMode.DONT_MODIFY,
+                null, // no BAQ
+                null, // no BQSR
+                (byte) -1,
+                removeProgramRecords);
+
+        List<SAMProgramRecord> dontRemoveProgramRecords = data.getHeader().getProgramRecords();
+        assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false");
+
+        removeProgramRecords = true;
+        data = new SAMDataSource(readers,
+                new ThreadAllocation(),
+                null,
+                genomeLocParser,
+                false,
+                SAMFileReader.ValidationStringency.SILENT,
+                null,
+                null,
+                new ValidationExclusion(),
+                new ArrayList<ReadFilter>(),
+                false,
+                BAQ.CalculationMode.OFF,
+                BAQ.QualityMode.DONT_MODIFY,
+                null, // no BAQ
+                null, // no BQSR
+                (byte) -1,
+                removeProgramRecords);
+
+        List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords();
+        assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true");
+    }
 }
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@ -34,7 +34,7 @@ import java.util.Arrays;
 import java.util.List;

 public class VariantEvalIntegrationTest extends WalkerTest {
-    private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval/";
+    private static String variantEvalTestDataRoot = privateTestDir + "VariantEval/";
    private static String fundamentalTestVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
    private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.vcf";
    private static String fundamentalTestSNPsWithMLEVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.withMLE.vcf";
@ -122,7 +122,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                        "-o %s"
                ),
                1,
-                Arrays.asList("e62a3bd9914d48e2bb2fb4f5dfc5ebc0")
+                Arrays.asList("40abbc9be663aed8ee1158f832463ca8")
        );
        executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec);
    }
@ -144,7 +144,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                        "-o %s"
                ),
                1,
-                Arrays.asList("087a2d9943c53e7f49663667c3305c7e")
+                Arrays.asList("106a0e8753e839c0a2c030eb4b165fa9")
        );
        executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec);
    }
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
@ -26,9 +26,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
    }

    VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
-            "62f81e7d2082fbc71cae0101c27fefad",  // tranches
-            "b9709e4180e56abc691b208bd3e8626c",  // recal file
-            "75c178345f70ca2eb90205662fbdf968"); // cut VCF
+            "f360ce3eb2b0b887301be917a9843e2b",  // tranches
+            "287fea5ea066bf3fdd71f5ce9b58eab3",  // recal file
+            "356b9570817b9389da71fbe991d8b2f5"); // cut VCF

    @DataProvider(name = "VRTest")
    public Object[][] createData1() {
--- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java
@ -351,7 +351,7 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
    public void testEncodingListOfString(List<String> strings, String expected) throws IOException {
        final String collapsed = BCF2Utils.collapseStringList(strings);
        Assert.assertEquals(collapsed, expected);
-        Assert.assertEquals(BCF2Utils.exploreStringList(collapsed), strings);
+        Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings);
    }

    // -----------------------------------------------------------------
--- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java
@ -57,7 +57,7 @@ public class VCFIntegrationTest extends WalkerTest {
        String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s ";

        String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
-        WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(""));
+        WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("bdab26dd7648a806dbab01f64db2bdab"));
        executeTest("Test reading and writing 1000G Phase I SVs", spec1);
    }

@ -77,7 +77,7 @@ public class VCFIntegrationTest extends WalkerTest {
        String testVCF = privateTestDir + "ex2.vcf";
        String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s ";
        String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
-        WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("a04a0fc22fedb516c663e56e51fc1e27"));
+        WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e8f721ce81e4fdadba13c5291027057f"));
        executeTest("Test writing samtools WEx BCF example", spec1);
    }

--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java
@ -152,7 +152,7 @@ public class VariantContextBenchmark extends SimpleBenchmark {
                    public void run(final VariantContext vc) {
                        if ( samples == null )
                            samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
-                        VariantContext sub = vc.subContextFromSamples(samples, true);
+                        VariantContext sub = vc.subContextFromSamples(samples);
                        sub.getNSamples();
                    }
                };
--- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala
+++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala
@ -41,7 +41,7 @@ class DataProcessingPipelineTest {
      " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf",
      " -test ",
      " -p " + projectName).mkString
-    spec.fileMD5s += testOut -> "021db3a52ee40d9193895213aaa05dd5"
+    spec.fileMD5s += testOut -> "60d39ae909fdd049920b54e0965b6d3c"
    PipelineTest.executeTest(spec)
  }

@ -60,7 +60,7 @@ class DataProcessingPipelineTest {
      " -bwa /home/unix/carneiro/bin/bwa",
      " -bwape ",
      " -p " + projectName).mkString
-    spec.fileMD5s += testOut -> "94e32ee974aa15d05ee7d3e99f2f8c8f"
+    spec.fileMD5s += testOut -> "61ca3237afdfabf78ee27a5bb80dae59"
    PipelineTest.executeTest(spec)
  }

--- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala
+++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala
@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest {
      " -blasr ",
      " -test ",
      " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf").mkString
-    spec.fileMD5s += testOut -> "8b5f9e8e1ba5bd686e3084eaeea23889"
+    spec.fileMD5s += testOut -> "61b06e8b78a93e6644657e6d38851084"
    PipelineTest.executeTest(spec)
  }
 }