Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2011-07-14 19:18:05 -04:00
commit 10cf9245d7
20 changed files with 268 additions and 178 deletions

View File

@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.Arrays;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/**
@ -58,7 +54,13 @@ public class VCFDiffableReader implements DiffableReader {
VCFCodec vcfCodec = new VCFCodec();
// must be read as state is stored in reader itself
vcfCodec.readHeader(lineReader);
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
String key = headerLine.getKey();
if ( headerLine instanceof VCFNamedHeaderLine )
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
root.add(key, headerLine.toString());
}
String line = lineReader.readLine();
int count = 0;

View File

@ -220,6 +220,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
else {
unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly
if (DEBUG)
logger.debug("Unprocessed variant = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
}
int numReads = 0;
@ -1105,7 +1108,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
this.alleles = vc.getAlleles();
this.genotypes = new HashMap<String, Genotype>(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable
this.negLog10PError = vc.getNegLog10PError();
this.filters = vc.getFilters();
this.filters = vc.filtersWereApplied() ? vc.getFilters() : null;
this.attributes = new HashMap<String, Object>(vc.getAttributes());
}

View File

@ -199,8 +199,8 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
//hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
for ( VCFHeaderLine field : hInfo ) {

View File

@ -7,6 +7,8 @@ import org.broad.tribble.NameAwareCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -96,6 +98,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
for ( String str : headerStrings ) {
if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR);
if ( strings.length < VCFHeader.HEADER_FIELDS.values().length )
throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str);
int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
try {
@ -159,12 +164,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
}
private Feature reallyDecode(String line) {
try {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new IllegalStateException("VCF Header cannot be null when decoding a record");
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
if (parts == null)
parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
@ -174,17 +178,18 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
// if we don't have a header, or we have a header with no genotyping data, check that we have eight columns. Otherwise check that we have nine (normal columns + genotyping data)
if (( (header == null || (header != null && !header.hasGenotypingData())) && nParts != NUM_STANDARD_FIELDS) ||
(header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
throw new IllegalArgumentException("There aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )");
throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )", lineNo);
return parseVCFLine(parts);
} catch (TribbleException e) {
throw new TribbleException.InvalidDecodeLine(e.getMessage(), line);
}
}
protected void generateException(String message) {
throw new TribbleException.InvalidDecodeLine(message, lineNo);
throw new UserException.MalformedVCF(message, lineNo);
}
private static void generateException(String message, int lineNo) {
throw new UserException.MalformedVCF(message, lineNo);
}
/**
@ -472,10 +477,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
return true;
}
private static void generateException(String message, int lineNo) {
throw new TribbleException.InvalidDecodeLine(message, lineNo);
}
private static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
boolean clipping = true;
// Note that the computation of forward clipping here is meant only to see whether there is a common

View File

@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter {
} else {
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
genotypeAttributeKeys.add(key);
}
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
} else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter {
continue;
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
continue;
}
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
@ -440,9 +444,10 @@ public class StandardVCFWriter implements VCFWriter {
break;
}
for (String s : attrs ) {
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(s);
for (int i = 0; i < attrs.size(); i++) {
if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) )
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(attrs.get(i));
}
}
}
@ -488,10 +493,13 @@ public class StandardVCFWriter implements VCFWriter {
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();
boolean sawGoodGT = false;
boolean sawGoodQual = false;
boolean sawGenotypeFilter = false;
for ( Genotype g : vc.getGenotypes().values() ) {
keys.addAll(g.getAttributes().keySet());
if ( g.isAvailable() )
sawGoodGT = true;
if ( g.hasNegLog10PError() )
sawGoodQual = true;
if (g.isFiltered() && g.isCalled())
@ -504,7 +512,17 @@ public class StandardVCFWriter implements VCFWriter {
if (sawGenotypeFilter)
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
return ParsingUtils.sortList(new ArrayList<String>(keys));
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
// make sure the GT is first
if ( sawGoodGT ) {
List<String> newList = new ArrayList<String>(sortedList.size()+1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
}
return sortedList;
}

View File

@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec {
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec {
}
}
// check to make sure we found a genotype field
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {

View File

@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec {
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec {
}
}
// check to make sure we found a genotype field
// TODO -- This is no longer required in v4.1
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
genotypes.put(sampleName,
new Genotype(sampleName,
GTalleles,
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}

View File

@ -154,6 +154,16 @@ public class UserException extends ReviewedStingException {
}
}
/**
 * Thrown when a VCF file being parsed violates the VCF specification.
 * Carries either the offending line's content or its line number so the
 * user can locate the problem in the input file.
 */
public static class MalformedVCF extends UserException {
    /**
     * @param message description of the formatting problem
     * @param line    the raw VCF line that failed to parse
     */
    public MalformedVCF(String message, String line) {
        super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
    }

    /**
     * @param message description of the formatting problem
     * @param lineNo  the line number within the VCF file at which parsing failed
     */
    public MalformedVCF(String message, int lineNo) {
        // Fixed typo in user-facing message: "nmber" -> "number".
        super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message));
    }
}
public static class ReadMissingReadGroup extends MalformedBAM {
public ReadMissingReadGroup(SAMRecord read) {
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -19,12 +20,14 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = null; // new ArrayList<Allele>();
protected Type type = null;
protected boolean isPhased = false;
private boolean filtersWereAppliedToContext;
protected boolean filtersWereAppliedToContext;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
this.alleles = Collections.unmodifiableList(alleles);
if ( alleles != null )
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
filtersWereAppliedToContext = filters != null;
this.isPhased = isPhased;
@ -66,6 +69,9 @@ public class Genotype {
}
public List<Allele> getAlleles(Allele allele) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
List<Allele> al = new ArrayList<Allele>();
for ( Allele a : alleles )
if ( a.equals(allele) )
@ -75,6 +81,8 @@ public class Genotype {
}
public Allele getAllele(int i) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
return alleles.get(i);
}
@ -89,10 +97,21 @@ public class Genotype {
NO_CALL,
HOM_REF,
HET,
HOM_VAR
HOM_VAR,
UNAVAILABLE
}
public Type getType() {
if ( type == null ) {
type = determineType();
}
return type;
}
protected Type determineType() {
if ( alleles == null )
return Type.UNAVAILABLE;
Allele firstAllele = alleles.get(0);
if ( firstAllele.isNoCall() ) {
@ -122,7 +141,8 @@ public class Genotype {
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
*/
public boolean isNoCall() { return getType() == Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
//
// Useful methods for getting genotype likelihoods for a genotype object, if present
@ -157,8 +177,8 @@ public class Genotype {
}
public void validate() {
if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles");
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
if ( alleles == null ) return;
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
int nNoCalls = 0;
for ( Allele allele : alleles ) {
@ -175,6 +195,9 @@ public class Genotype {
}
public String getGenotypeString(boolean ignoreRefState) {
if ( alleles == null )
return null;
// Notes:
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)

View File

@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
if ( g.isAvailable() ) {
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
}
}
}
}

View File

@ -26,7 +26,9 @@
package org.broadinstitute.sting;
import org.apache.commons.lang.StringUtils;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
@ -64,10 +66,19 @@ public class WalkerTest extends BaseTest {
}
System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile);
Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()), "Index on disk from indexing on the fly not equal to the index created after the run completed");
Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec());
Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath());
if ( ! indexFromOutputFile.equals(dynamicIndex) ) {
Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n",
indexFromOutputFile.getProperties(),
dynamicIndex.getProperties()));
}
}
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) {
List<String> md5s = new ArrayList<String>();
for (int i = 0; i < resultFiles.size(); i++) {

View File

@ -87,7 +87,7 @@ public class DiffableReaderUnitTest extends BaseTest {
Assert.assertSame(diff.getParent(), DiffElement.ROOT);
DiffNode node = diff.getValueAsNode();
Assert.assertEquals(node.getElements().size(), 9);
Assert.assertEquals(node.getElements().size(), 10);
// chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03
DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode();

View File

@ -23,7 +23,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+ " -L chr20:556259-756570",
1,
Arrays.asList("e312b7d3854d5b2834a370659514a813"));
Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4"));
executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec);
}
@ -33,7 +33,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+ " -L chr20:556259-756570",
1,
Arrays.asList("681f50e45f1d697370d2c355df2e18bc"));
Arrays.asList("53dd312468296826bdd3c22387390c88"));
executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec);
}
@ -43,7 +43,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+ " -L chr20:556259-756570",
1,
Arrays.asList("0bccb0ef928a108418246bec01098083"));
Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee"));
executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec);
}

View File

@ -23,7 +23,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+ " -L chr20:556259-756570",
1,
Arrays.asList("e16f957d888054ae0518e25660295241"));
Arrays.asList("af5e1370822551c0c6f50f23447dc627"));
executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec);
}
@ -33,7 +33,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+ " -L chr20:556259-756570",
1,
Arrays.asList("122a482090677c7619c2105d44e00d11"));
Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb"));
executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec);
}
@ -43,7 +43,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+ " -L chr20:556259-756570",
1,
Arrays.asList("bc6a8c8a42bb2601db98e88e9ad74748"));
Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5"));
executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec);
}

View File

@ -1,46 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
public class BatchMergeIntegrationTest extends WalkerTest {
    /**
     * Runs the UnifiedGenotyper in GENOTYPE_GIVEN_ALLELES mode over a fixed
     * BAM / allele-VCF pair and verifies the emitted calls against a known MD5.
     */
    @Test
    public void testBatchMerge1() {
        final String bamFile = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
        final String alleleFile = validationDataLocation + "batch.merge.alleles.vcf";

        // Genotype exactly the sites/alleles listed in the allele VCF, emitting all sites.
        final String commandLine =
                "-T UnifiedGenotyper -NO_HEADER -BTI alleles -stand_call_conf 0.0 -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference
                + " -B:alleles,VCF " + alleleFile
                + " -I " + bamFile;

        final WalkerTestSpec spec = new WalkerTestSpec(
                commandLine,
                1,
                Arrays.asList("f4ed8f4ef2cba96823c06e90e9d0de35"));
        executeTest("testBatchMerge UG genotype given alleles:" + new File(bamFile).getName() + " with " + new File(alleleFile).getName(), spec);
    }
}

View File

@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("815b82fff92aab41c209eedce2d7e7d9");
md5.add("4accae035d271b35ee2ec58f403c68c6");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -38,7 +38,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("22336ee9c12aa222ce29c3c5babca7d0");
md5.add("71e8c98d7c3a73b6287ecc339086fe03");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingHapMapInput() {
List<String> md5 = new ArrayList<String>();
md5.add("9bedaa7670b86a07be5191898c3727cf");
md5.add("f343085305e80c7a2493422e4eaad983");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -73,7 +73,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingVCFInput() {
List<String> md5 = new ArrayList<String>();
md5.add("cc215edec9ca28e5c79ab1b67506f9f7");
md5.add("86f02e2e764ba35854cff2aa05a1fdd8");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.List;
public class VCFIntegrationTest extends WalkerTest {
    /**
     * Round-trip check: running VariantAnnotator (with nothing to annotate) and then
     * VariantsToVCF over its output should each reproduce the input VCF byte-for-byte,
     * i.e. both outputs must match the input file's MD5.
     */
    @Test
    public void testReadingAndWritingWitHNoChanges() {
        final String expectedMd5 = "a990ba187a69ca44cb9bc2bb44d00447";
        final String inputVcf = validationDataLocation + "vcf4.1.example.vcf";
        final String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s ";

        // Pass 1: annotate with no changes requested; output must equal the input.
        final String annotatorCmd = baseCommand + "-T VariantAnnotator -BTI variant -B:variant,vcf " + inputVcf;
        final WalkerTestSpec annotatorSpec = new WalkerTestSpec(annotatorCmd, 1, Arrays.asList(expectedMd5));
        final List<File> annotatorOutput = executeTest("Test Variant Annotator with no changes", annotatorSpec).getFirst();

        // Pass 2: re-emit pass 1's output through VariantsToVCF; it must still match.
        final String toVcfCmd = baseCommand + "-T VariantsToVCF -B:variant,vcf " + annotatorOutput.get(0).getAbsolutePath();
        final WalkerTestSpec toVcfSpec = new WalkerTestSpec(toVcfCmd, 1, Arrays.asList(expectedMd5));
        executeTest("Test Variants To VCF from new output", toVcfSpec);
    }
}

View File

@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.{SAMFileReader}
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.qscripts.utils.Utils
class DataProcessingPipeline extends QScript {
qscript =>
@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript {
val ds: String)
{}
// Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that)
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
return false
}
// Utility function to merge all bam files of similar samples. Generates one BAM file per sample.
// It uses the sample information on the header of the input BAM files.
//
@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript {
// only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
// with PrintReads to separate the samples. Tell user to do it himself!
assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
// Fill out the sample table with the readgroups in this file
for (rg <- readGroups) {
@ -147,20 +135,23 @@ class DataProcessingPipeline extends QScript {
}
}
println("\n\n*** DEBUG ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
println(sample + ":")
for (f <- flist)
println (f)
println()
val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam")
sampleBamFiles(sample) = sampleFileName
add(joinBams(flist, sampleFileName))
}
return sampleBamFiles.toMap
}
println("*** DEBUG ***\n\n")
// Checks how many contigs are in the dataset. Uses the BAM file header information.
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
return sampleBamFiles.toMap
}
// Rebuilds the Read Group string to give BWA
@ -206,17 +197,6 @@ class DataProcessingPipeline extends QScript {
return realignedBams
}
// Reads a BAM LIST file and creates a scala list with all the files
def createListFromFile(in: File):List[File] = {
if (in.toString.endsWith("bam"))
return List(in)
var l: List[File] = List()
for (bam <- fromFile(in).getLines)
l :+= new File(bam)
return l
}
/****************************************************************************
* Main script
@ -226,17 +206,14 @@ class DataProcessingPipeline extends QScript {
def script = {
// keep a record of the number of contigs in the first bam file in the list
val bams = createListFromFile(input)
nContigs = getNumberOfContigs(bams(0))
val bams = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bams(0))
val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams}
// Generate a BAM file per sample joining all per lane files if necessary
val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams)
println("nContigs: " + nContigs)
// Final output list of processed bam files
var cohortList: List[File] = List()
@ -244,6 +221,7 @@ class DataProcessingPipeline extends QScript {
println("\nFound the following samples: ")
for ((sample, file) <- sampleBamFiles)
println("\t" + sample + " -> " + file)
println("\n")
// If this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
import io.Source._
import org.broadinstitute.sting.queue.qscripts.utils.Utils
/**
* Created by IntelliJ IDEA.
@ -32,26 +34,25 @@ class RecalibrateBaseQualities extends QScript {
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
def script = {
nContigs = getNumberOfContigs(input)
val bamList = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bamList(0))
val recalFile1: File = swapExt(input, ".bam", "recal1.csv")
val recalFile2: File = swapExt(input, ".bam", "recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
add(cov(input, recalFile1),
recal(input, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
for (bam <- bamList) {
val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bam, ".bam", ".recal.bam")
val path1: String = bam + "before"
val path2: String = bam + "after"
add(cov(bam, recalFile1),
recal(bam, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
}
trait CommandLineGATKArgs extends CommandLineGATK {

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.queue.qscripts.utils
import java.io.File
import io.Source._
import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader}
import collection.JavaConversions._
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 7/14/11
* Time: 4:57 PM
* To change this template use File | Settings | File Templates.
*/
/** Shared helpers for the Queue QScripts: BAM-list parsing and BAM-header queries. */
object Utils {
/**
* Takes a BAM list file and produces a Scala list with one entry per file, allowing
* the list to contain empty lines and comment lines (lines starting with #).
*/
def createListFromFile(in: File):List[File] = {
// If the file provided ends with .bam, it is not a bam list; we treat it as a single file
// and return a list with only this file.
if (in.toString.endsWith(".bam"))
return List(in)
var list: List[File] = List()
// Skip comment (#) and blank lines; trim surrounding whitespace from each path.
for (bam <- fromFile(in).getLines)
if (!bam.startsWith("#") && !bam.isEmpty )
list :+= new File(bam.trim())
list
}
/**
* Returns the number of contigs (sequence-dictionary entries) in the BAM file's header.
*/
def getNumberOfContigs(bamFile: File): Int = {
// NOTE(review): bamFile is already a File; the extra new File(...) wrapper appears
// redundant and presumably relies on an implicit conversion — confirm before changing.
val samReader = new SAMFileReader(new File(bamFile))
samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
/**
* Checks whether the given read groups span more than one sample name;
* returns true as soon as a second distinct sample is seen.
*/
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
// Remember the first (non-empty) sample name, then compare all later read groups to it.
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
false
}
}