merging now very close to working. Bug todo in writer and vcf infrastructure. Can almost create merged snp and indel files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3712 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-02 20:09:25 +00:00 · 2010-07-02 20:09:25 +00:00 · cd2e4b0a1e
parent b6bdd61283
commit cd2e4b0a1e
10 changed files with 123 additions and 31 deletions
--- a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java
+++ b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java
@ -14,7 +14,7 @@ import java.util.Map;
 *         <p/>
 *         A class representing a key=value entry for genotype FORMAT fields in the VCF header
 */
-public class VCFFormatHeaderLine extends VCFHeaderLine {
+public class VCFFormatHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {

    // the format field types
    public enum FORMAT_TYPE {
--- a/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java
+++ b/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java
@ -1,5 +1,7 @@
 package org.broad.tribble.vcf;

+import org.broadinstitute.sting.utils.StingException;
+
 import java.util.*;

 /**
@ -48,6 +50,9 @@ class VCF4Parser implements VCFLineParser {
        for (Map.Entry<String,?> entry : keyValues.entrySet()) {
            if (start) start = false;
            else builder.append(",");
+
+            if ( entry.getValue() == null ) throw new StingException("Header problem: unbound value at " + entry + " from " + keyValues);
+
            builder.append(entry.getKey());
            builder.append("=");
            builder.append(entry.getValue().toString().contains(",") ||
--- a/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java
+++ b/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java
@ -1,7 +1,5 @@
 package org.broad.tribble.vcf;

-import org.broad.tribble.util.ParsingUtils;
-
 import java.util.Arrays;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -14,7 +12,7 @@ import java.util.Map;
 *         <p/>
 *         A class representing a key=value entry for INFO fields in the VCF header
 */
-public class VCFInfoHeaderLine extends VCFHeaderLine {
+public class VCFInfoHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {

    // the info field types
    public enum INFO_TYPE {
@ -106,6 +104,7 @@ public class VCFInfoHeaderLine extends VCFHeaderLine {
               mType == other.mType;
    }

+    @Override
    public String getmName() {
        return mName;
    }
--- a/java/src/org/broad/tribble/vcf/VCFNamedHeaderLine.java
+++ b/java/src/org/broad/tribble/vcf/VCFNamedHeaderLine.java
@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broad.tribble.vcf;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: depristo
+ * Date: Jul 2, 2010
+ * Time: 2:40:45 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public interface VCFNamedHeaderLine {
+    String getmName();
+}
--- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@ -175,6 +175,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
            else if(inputFile.getName().toLowerCase().endsWith(".bam")) {
                unpackedReads.add( inputFile );
            }
+            else if(inputFile.getName().equals("-")) {
+                unpackedReads.add( new File("/dev/stdin") );
+            }
            else {
                Utils.scareUser(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
                                              "with the .list extension, but the file %s has neither extension.  Please ensure that your BAM file or list " +
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@ -728,9 +728,9 @@ public class GenomeAnalysisEngine {
                                             ValidationExclusion exclusions) {
        // Use monolithic sharding if no index is present.  Monolithic sharding is always required for the original
        // sharding system; it's required with the new sharding system only for locus walkers.
-        if(readsDataSource != null && !readsDataSource.hasIndex() && walker instanceof LocusWalker) {
+        if(readsDataSource != null && !readsDataSource.hasIndex() ) { 
            if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM) || intervals != null)
-                throw new StingException("The GATK cannot currently process unindexed BAM files");
+                throw new StingException("The GATK cannot currently process unindexed BAM files without the -U ALLOW_UNINDEXED_BAM, or with unindexed BAM files with the -L option");

            Shard.ShardType shardType;
            if(walker instanceof LocusWalker) {
--- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
+++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
@ -165,7 +165,7 @@ public class VariantContextUtils {
    }

    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
-        return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
+        return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false, false);
    }


@ -179,7 +179,7 @@ public class VariantContextUtils {
     * @param mergeOptions
     * @return
     */
-    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
+    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin, boolean printMessages ) {
        if ( unsortedVCs == null || unsortedVCs.size() == 0 )
            return null;

@ -256,7 +256,7 @@ public class VariantContextUtils {
            attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));

        VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
-        //if ( remapped ) System.out.printf("Remapped => %s%n", merged);
+        if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
        return merged;
    }

--- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java
@ -434,11 +434,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {

            double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
            Set<String> genotypeFilters = null;
-            String sampleName = sampleNameIterator.next();
-
-
-            // todo -- the parsing of attributes could be made lazy for performance
            Map<String, String> gtAttributes = null;
+            String sampleName = sampleNameIterator.next();

            // check to see if the value list is longer than the key list, which is a problem
            if (nGTKeys < GTValueSplitSize)
@ -457,8 +454,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
                            genotypeAlleleLocation = i;
                    else if (genotypeKeyArray[i].equals("GQ"))
                        GTQual = parseQual(GTValueArray[i]);
-                    else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
-                        genotypeFilters.addAll(parseFilters(GTValueArray[i]));
+                    else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
+                        genotypeFilters = parseFilters(GTValueArray[i]);
                    else
                        gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);

--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@ -25,8 +25,7 @@

 package org.broadinstitute.sting.gatk.walkers.variantutils;

-import org.broad.tribble.vcf.VCFHeader;
-import org.broad.tribble.vcf.VCFHeaderLine;
+import org.broad.tribble.vcf.*;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
@ -57,6 +56,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
    protected String PRIORITY_STRING = null;

+    @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
+    protected boolean printComplexMerges = false;
+
    private VCFWriter vcfWriter = null;
    private List<String> priority = null;
    protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
@ -72,16 +74,21 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
        vcfWriter = new VCFWriter(out, true);
        priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));

+
+        // todo -- need to merge headers in an intelligent way
+
        validateAnnotateUnionArguments(priority);
        mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
-        Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
+        Map<String, VCFHeader> vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null);
+        Set<String> samples = getSampleList(vcfRods, mergeOptions);

-        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
-        metaData.add(new VCFHeaderLine("source", "VCFCombine"));
-        vcfWriter.writeHeader(new VCFHeader(metaData, samples));
+        Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
+        headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
+        vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
    }

-    private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
+    // todo -- Eric, where's a better place to put this?
+    public static Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
        Set<String> samples = new TreeSet<String>();
        for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
            VCFHeader header = val.getValue();
@ -93,6 +100,48 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
        return samples;
    }

+    // todo -- Eric, where's a better place to put this?
+    public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers) throws IllegalStateException {
+        HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
+        HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
+
+        for ( VCFHeader source : headers ) {
+            //System.out.printf("Merging in header %s%n", source);
+            for ( VCFHeaderLine line : source.getMetaData()) {
+                String key = line.getKey();
+                if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getmName();
+
+                if ( map.containsKey(key) ) {
+                    VCFHeaderLine other = map.get(key);
+                    if ( line.equals(other) )
+                        continue;
+//                        System.out.printf("equals duplicate key %s%n", line);
+                    else if ( ! line.getClass().equals(other.getClass()) )
+                        throw new IllegalStateException("Incompatible header types: " + line + " " + other );
+                    else if ( line instanceof VCFFilterHeaderLine ) {
+                        String lineName = ((VCFFilterHeaderLine) line).getmName();
+                        String otherName = ((VCFFilterHeaderLine) other).getmName();
+                        if ( ! lineName.equals(otherName) )
+                            throw new IllegalStateException("Incompatible header types: " + line + " " + other );
+                    } else {
+                        //String lineName = ((VCFInfoHeaderLine) line).getmName();
+                        //String otherName = ((VCFFilterHeaderLine) other).getmName();
+
+                        // todo -- aaron, please complete these comparisons when INFO and Format header lines are made into one
+                        //if ( (lineType != null && ! lineType.equals(otherType)) || (lineCount != null && !lineCounts.equals(otherCount)))
+                        //    throw new IllegalStateException("Incompatible header types: " + line + " " + other );
+                    }
+                } else {
+                    map.put(key, line);
+                    //System.out.printf("Adding header line %s%n", line);
+                }
+            }
+        }
+
+        return new HashSet<VCFHeaderLine>(map.values());
+    }
+
+
    private void validateAnnotateUnionArguments(List<String> priority) {
        Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
        if ( priority == null || rodNames.size() != priority.size() )
@ -108,7 +157,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {

        // get all of the vcf rods at this locus
        Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
-        VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
+        VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true, printComplexMerges);
        if ( mergedVC != null ) // only operate at the start of events
            //if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
            vcfWriter.add(mergedVC, ref.getBases());
--- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java
+++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java
@ -28,6 +28,7 @@ public class VCFWriter {
    BufferedWriter mWriter;

    private boolean writingVCF40Format;
+    private String PASSES_FILTERS_STRING = null;

    // our genotype sample fields
    private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
@ -47,8 +48,8 @@ public class VCFWriter {

    // default values
    private static final String UNFILTERED = ".";
-    private static final String PASSES_FILTERS = "0";
-    private static final String PASSES_FILTERS_VCF_4_0 = "PASS";
+    private static final String PASSES_FILTERS_VCF3 = "0";
+    private static final String PASSES_FILTERS_VCF4 = "PASS";
    private static final String EMPTY_INFO_FIELD = ".";
    private static final String EMPTY_ID_FIELD = ".";
    private static final String EMPTY_ALLELE_FIELD = ".";
@ -66,12 +67,15 @@ public class VCFWriter {

    public VCFWriter(File location, boolean useVCF4Format) {
        this.writingVCF40Format = useVCF4Format;
+        this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
+
        FileOutputStream output;
        try {
            output = new FileOutputStream(location);
        } catch (FileNotFoundException e) {
            throw new RuntimeException("Unable to create VCF file at location: " + location);
        }
+
        mWriter = new BufferedWriter(new OutputStreamWriter(output));
    }

@ -87,6 +91,7 @@ public class VCFWriter {
    }
    public VCFWriter(OutputStream output, boolean useVCF4Format) {
        this.writingVCF40Format = useVCF4Format;
+        this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
        mWriter = new BufferedWriter(new OutputStreamWriter(output));
    }

@ -238,8 +243,8 @@ public class VCFWriter {
        // TODO- clean up these flags and associated code
        boolean filtersWereAppliedToContext = true;
        List<String> allowedGenotypeAttributeKeys = null;
-        boolean filtersWereAppliedToGenotypes = false;
-        String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
+        boolean filtersWereAppliedToGenotypes = true;
+        String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_STRING : UNFILTERED);

        Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
        alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
@ -251,8 +256,6 @@ public class VCFWriter {

        ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");

-
-
        // search for reference allele and find trailing and padding at the end.
        if (originalAlleles != null) {
            Allele originalReferenceAllele = null;
@ -339,7 +342,7 @@ public class VCFWriter {
                if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
                    vcfGenotypeAttributeKeys.add(key);
            }
-            if ( filtersWereAppliedToGenotypes )
+            if ( filtersWereAppliedToGenotypes ) // todo -- should be calculated
                vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
        }
        String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
@ -382,7 +385,7 @@ public class VCFWriter {
                        val = pileup.size();
                } else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
                    // VCF 4.0 key for no filters is "."
-                    val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : UNFILTERED;
+                    val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS_STRING;
                }