Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-08-18 09:32:57 -04:00 · 2011-08-18 09:32:57 -04:00 · 4da42d9f39
parent 2e35592295 de35bb2e4d
commit 4da42d9f39
7 changed files with 540 additions and 21 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@ -64,11 +64,11 @@ import static java.lang.Math.log10;
 * <pre>
 *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
 *      -R reffile.fasta -T BeagleOutputToVCF \
- *      -B:variant,VCF input_vcf.vcf \
- *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
- *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
- *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
- *      --out output_vcf.vcf
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
 *      </pre>

 <p> Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them </p>
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@ -68,7 +68,7 @@ import java.util.*;
 * <pre>
 *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
 *      -R reffile.fasta -T ProduceBeagleInput \
- *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
 * </pre>
 *
 */
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.validation;
+
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
+
+import java.util.Map;
+import java.util.Set;
+
+import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
+
+/**
+ * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper.
+ *
+ *  <p>
+ *     Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs
+ *     and Indels given a secondary (validation) data source. The data sources are BAM or VCF
+ *     files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF
+ *     to validate calls on a BAM).
+ *  </p>
+ *
+ *  <p>
+ *     The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you
+ *     want to know how well a particular technology performs calling these snps. With a
+ *     dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you
+ *     can run GenotypeAndValidate to asses the accuracy of the calls with the new technology's
+ *     dataset.
+ *  </p>
+ *
+ *  <p>
+ *     Another option is to validate the calls on a VCF file, using a deep coverage BAM file
+ *     that you trust the calls on. The GenotypeAndValidate walker will make calls using the
+ *     reads in the BAM file and take them as truth, then compare to the calls in the VCF file
+ *     and produce a truth table.
+ *  </p>
+ *
+ *
+ * <h2>Input</h2>
+ *  <p>
+ *      A BAM file to make calls on and a VCF file to use as truth validation dataset.
+ *
+ *      You also have the option to invert the roles of the files using the command line options listed below.
+ *  </p>
+ *
+ * <h2>Output</h2>
+ *  <p>
+ *      GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a
+ *      2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true
+ *      positive or a false positive). The table should look like this:
+ *  </p>
+ * <center>
+ * <table id="description-table">
+ *     <tr>
+ *         <th></th>
+ *         <th>ALT</th>
+ *         <th>REF</th>
+ *         <th>Predictive Value</th>
+ *     </tr>
+ *     <tr>
+ *         <td><b>called alt</b></td>
+ *         <td>True Positive (TP)</td>
+ *         <td>False Positive (FP)</td>
+ *         <td>Positive PV</td>
+ *     </tr>
+ *     <tr>
+ *         <td><b>called ref</b></td>
+ *         <td>False Negative (FN)</td>
+ *         <td>True Negative (TN)</td>
+ *         <td>Negative PV</td>
+ *     </tr>
+ * </table>
+ * </center>
+ *
+ *  <p>
+ *      The <b>positive predictive value (PPV)</b> is the proportion of subjects with positive test results
+ *      who are correctly diagnosed.
+ *  </p>
+ *  <p>
+ *      The <b>negative predictive value (NPV)</b> is the proportion of subjects with a negative test result
+ *      who are correctly diagnosed.
+ *  </p>
+ *  <p>
+ *      The VCF file will contain only the variants that were called or not called, excluding the ones that
+ *      were uncovered or didn't pass the filters. This file is useful if you are trying to compare
+ *      the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to
+ *      apples).
+ *  </p>
+ *
+ *  <p>
+ *      Here is an example of an annotated VCF file (info field clipped for clarity)
+ *
+ * <pre>
+ * #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+ * 1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+ * 1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+ * 13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+ * 1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+ * </pre>
+ *
+ *  </p>
+ *
+ *  <h3>Additional Details</h3>
+ *  <ul>
+ *      <li>
+ *          You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file.
+ *          This speeds up the process a lot.
+ *      </li>
+ *      <li>
+ *          The total number of visited bases may be greater than the number of variants in the original
+ *          VCF file because of extended indels, as they trigger one call per new insertion or deletion.
+ *          (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF).
+ *      </li>
+ *  </ul>
+ *
+ * <h2>Examples</h2>
+ * <ol>
+ *     <li>
+ *         Genotypes BAM file from new technology using the VCF as a truth dataset:
+ *     </li>
+ *
+ * <pre>
+ *  java
+ *      -jar /GenomeAnalysisTK.jar
+ *      -T  GenotypeAndValidate
+ *      -R human_g1k_v37.fasta
+ *      -I myNewTechReads.bam
+ *      -alleles handAnnotatedVCF.vcf
+ *      -BTI alleles
+ * </pre>
+ *
+ *      <li>
+ *          Using a BAM file as the truth dataset:
+ *      </li>
+ *
+ * <pre>
+ *  java
+ *      -jar /GenomeAnalysisTK.jar
+ *      -T  GenotypeAndValidate
+ *      -R human_g1k_v37.fasta
+ *      -I myTruthDataset.bam
+ *      -alleles callsToValidate.vcf
+ *      -BTI alleles
+ *      -bt
+ *      -o gav.vcf
+ * </pre>
+ *
+ *
+ * @author Mauricio Carneiro
+ * @since ${DATE}
+ */
+
+@Requires(value={DataSource.READS, DataSource.REFERENCE})
+@Allows(value={DataSource.READS, DataSource.REFERENCE})
+
+@By(DataSource.REFERENCE)
+@Reference(window=@Window(start=-200,stop=200))
+
+
+public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> implements TreeReducible<GenotypeAndValidateWalker.CountedData> {
+
+    @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false)
+    protected VCFWriter vcfWriter = null;
+
+    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true)
+    public RodBinding<VariantContext> alleles;
+
+    @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false)
+    private boolean bamIsTruth = false;
+
+    @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
+    private int mbq = -1;
+
+    @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
+    private double deletions = -1;
+
+    @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false)
+    private double callConf = -1;
+
+    @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false)
+    private double emitConf = -1;
+
+    @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
+    private int minDepth = -1;
+
+    @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false)
+    private String sample = "";
+
+    private UnifiedGenotyperEngine snpEngine;
+    private UnifiedGenotyperEngine indelEngine;
+
+    public static class CountedData {
+        private long nAltCalledAlt = 0L;
+        private long nAltCalledRef = 0L;
+        private long nRefCalledAlt = 0L;
+        private long nRefCalledRef = 0L;
+        private long nNotConfidentCalls = 0L;
+        private long nUncovered = 0L;
+
+        /**
+         * Adds the values of other to this, returning this
+         * @param other the other object
+         */
+        public void add(CountedData other) {
+            nAltCalledAlt += other.nAltCalledAlt;
+            nAltCalledRef += other.nAltCalledRef;
+            nRefCalledAlt += other.nRefCalledAlt;
+            nRefCalledRef += other.nRefCalledRef;
+            nUncovered += other.nUncovered;
+            nNotConfidentCalls += other.nNotConfidentCalls;
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // initialize
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public void initialize() {
+
+        // Initialize VCF header
+        if (vcfWriter != null) {
+            Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
+            Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+            Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
+            headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
+            vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
+        }
+
+        // Filling in SNP calling arguments for UG
+        UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
+        uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
+        uac.alleles = alleles;
+        if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
+        if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
+        if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
+        if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
+        if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
+
+        uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
+        snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+        // Adding the INDEL calling arguments for UG
+        uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL;
+        indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+        // make sure we have callConf set to the threshold set by the UAC so we can use it later.
+        callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING;
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // map
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
+
+        final CountedData counter = new CountedData();
+
+        // For some reason RodWalkers get map calls with null trackers
+        if( tracker == null )
+            return counter;
+
+        VariantContext vcComp = tracker.getFirstValue(alleles);
+        if( vcComp == null )
+            return counter;
+
+        //todo - not sure I want this, may be misleading to filter extended indel events.
+        if (isInsideExtendedIndel(vcComp,  ref))
+            return counter;
+
+        // Do not operate on variants that are not covered to the optional minimum depth
+        if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
+            counter.nUncovered = 1L;
+            return counter;
+        }
+
+        VariantCallContext call;
+        if ( vcComp.isSNP() )
+            call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+        else if ( vcComp.isIndel() ) {
+            call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+        }
+        else {
+            logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
+            return counter;
+        }
+
+
+        boolean writeVariant = true;
+
+        if (bamIsTruth) {
+            if (call.confidentlyCalled) {
+                // If truth is a confident REF call
+                if (call.isVariant()) {
+                    if (vcComp.isVariant())
+                        counter.nAltCalledAlt = 1L;  // todo -- may wanna check if the alts called are the same?
+                    else
+                        counter.nAltCalledRef = 1L;
+                }
+                // If truth is a confident ALT call
+                else {
+                    if (vcComp.isVariant())
+                        counter.nRefCalledAlt = 1L;
+                    else
+                        counter.nRefCalledRef = 1L;
+                }
+            }
+            else {
+                counter.nNotConfidentCalls = 1L;
+                writeVariant = false;
+            }
+        }
+        else {
+            if (!vcComp.hasAttribute("GV"))
+                throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
+
+
+
+            if (call.isCalledAlt(callConf)) {
+                if (vcComp.getAttribute("GV").equals("T"))
+                    counter.nAltCalledAlt = 1L;
+                else
+                    counter.nRefCalledAlt = 1L;
+            }
+            else if (call.isCalledRef(callConf)) {
+                if (vcComp.getAttribute("GV").equals("T"))
+                    counter.nAltCalledRef = 1L;
+                else
+                    counter.nRefCalledRef = 1L;
+            }
+            else {
+                counter.nNotConfidentCalls = 1L;
+                writeVariant = false;
+            }
+        }
+
+        if (vcfWriter != null && writeVariant) {
+            if (!vcComp.hasAttribute("callStatus")) {
+                MutableVariantContext mvc = new MutableVariantContext(vcComp);
+                mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
+                vcfWriter.add(mvc);
+            }
+            else
+                vcfWriter.add(vcComp);
+        }
+        return counter;
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // reduce
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public CountedData reduceInit() {
+        return new CountedData();
+    }
+
+    public CountedData treeReduce( final CountedData sum1, final CountedData sum2) {
+        sum2.add(sum1);
+        return sum2;
+    }
+
+    public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) {
+        reduceSum.add(mapValue);
+        return reduceSum;
+    }
+
+    public void onTraversalDone( CountedData reduceSum ) {
+        double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt));
+        double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef));
+        double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef));
+        double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100;
+        logger.info(String.format("Resulting Truth Table Output\n\n" +
+                                  "---------------------------------------------------\n" +
+                                  "\t\t|\tALT\t|\tREF\t\n"  +
+                                  "---------------------------------------------------\n" +
+                                  "called alt\t|\t%d\t|\t%d\n" +
+                                  "called ref\t|\t%d\t|\t%d\n" +
+                                  "---------------------------------------------------\n" +
+                                  "positive predictive value: %f%%\n" +
+                                  "negative predictive value: %f%%\n" +
+                                  "---------------------------------------------------\n" +
+                                  "sensitivity: %f%%\n" +
+                                  "specificity: %f%%\n" +
+                                  "---------------------------------------------------\n" +
+                                  "not confident: %d\n" +
+                                  "not covered: %d\n" +
+                                  "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered));
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@ -191,16 +191,28 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    protected VCFWriter vcfWriter = null;

    @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
-    public Set<String> sampleNames;
+    public Set<String> sampleNames = new HashSet<String>(0);

    @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
-    public Set<String> sampleExpressions;
+    public Set<String> sampleExpressions ;

    @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
    public Set<File> sampleFiles;

    /**
-     * Note that thse expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
+     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
+     */
+    @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false)
+    public Set<String> XLsampleNames = new HashSet<String>(0);
+
+    /**
+     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
+     */
+    @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
+    public Set<File> XLsampleFiles;
+
+    /**
+     * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
     */
    @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
    public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
@ -304,8 +316,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    private ArrayList<Double> afBoosts = null;
    double bkDelta = 0.0;

-
-        private PrintStream outMVFileStream = null;
+    private PrintStream outMVFileStream = null;


    /**
@ -321,19 +332,27 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
        Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
        Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);

+        // first, add any requested samples
        samples.addAll(samplesFromFile);
        samples.addAll(samplesFromExpressions);
-        if (sampleNames != null)
-            samples.addAll(sampleNames);
+        samples.addAll(sampleNames);

-        if(samples.isEmpty()) {
+        // if none were requested, we want all of them
+        if ( samples.isEmpty() ) {
            samples.addAll(vcfSamples);
            NO_SAMPLES_SPECIFIED = true;
        }

-        for (String sample : samples) {
+        // now, exclude any requested samples
+        Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
+        samples.removeAll(XLsamplesFromFile);
+        samples.removeAll(XLsampleNames);
+
+        if ( samples.size() == 0 )
+            throw new UserException("All samples requested to be included were also requested to be excluded.");
+
+        for ( String sample : samples )
            logger.info("Including sample '" + sample + "'");
-        }

        // Initialize VCF header
        Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@ -1080,8 +1080,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati
    }

    public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
-        // don't validate if we're an insertion
-        if ( !reference.isNull() && !reference.basesMatch(getReference()) ) {
+        // don't validate if we're an insertion or complex event
+        if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
            throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
        }

--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
@ -7,7 +7,7 @@ import java.util.Arrays;

 public class SelectVariantsIntegrationTest extends WalkerTest {
    public static String baseTestString(String args) {
-        return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s" + args;
+        return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s -NO_HEADER" + args;
    }

    @Test
@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
        String samplesFile = validationDataLocation + "SelectVariants.samples.txt";

        WalkerTestSpec spec = new WalkerTestSpec(
-            baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile + " -NO_HEADER"),
+            baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile),
            1,
            Arrays.asList("d18516c1963802e92cb9e425c0b75fd6")
        );
@ -24,12 +24,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
        executeTest("testComplexSelection--" + testfile, spec);
    }

+    @Test
+    public void testSampleExclusion() {
+        String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
+        String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
+
+        WalkerTestSpec spec = new WalkerTestSpec(
+            "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile,
+            1,
+            Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd")
+        );
+
+        executeTest("testSampleExclusion--" + testfile, spec);
+    }
+
    @Test
    public void testRepeatedLineSelection() {
        String testfile = validationDataLocation + "test.dup.vcf";

        WalkerTestSpec spec = new WalkerTestSpec(
-                baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile + " -NO_HEADER"),
+                baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile),
                1,
                Arrays.asList("b74038779fe6485dbb8734ae48178356")
        );
--- a/settings/helpTemplates/style.css
+++ b/settings/helpTemplates/style.css
@ -95,6 +95,12 @@ dd {
    padding: 0 0 0.5em 0;
 }

+pre {
+    border: thin solid lightgray;
+    margin-left: 1em;
+    margin-right: 4em;
+    background-color: #e0fdff;
+}
 /*
 * clean table layouts
 */
@ -128,6 +134,48 @@ dd {
 }

 th#row-divider 
+{
+    font-weight: bolder;
+	font-size: larger;
+}
+
+
+/*
+ *  Table design for input/ouptut description
+ */
+
+#description-table
+{
+	font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif;
+	font-size: 12px;
+	background: #fff;
+	margin: 5px;
+	border-collapse: collapse;
+	text-align: left;
+}
+#description-table th
+{
+	font-size: 16px;
+	font-weight: bold;
+    background-color: lightgray;
+	color: #039;
+    text-align: center;
+	padding: 10px 8px;
+	border-bottom: 2px solid #6678b1;
+}
+#description-table td
+{
+	border-bottom: 1px solid #ccc;
+	color: #669;
+	padding: 6px 8px;
+    text-align: right;
+}
+#description-table tbody tr:hover td
+{
+	color: #009;
+}
+
+th#row-divider
 {
    font-weight: bolder;
 	font-size: larger;