diff --git a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java index a31a2595c..9f60127d8 100755 --- a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java @@ -14,7 +14,7 @@ import java.util.Map; *

* A class representing a key=value entry for genotype FORMAT fields in the VCF header */ -public class VCFFormatHeaderLine extends VCFHeaderLine { +public class VCFFormatHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { // the format field types public enum FORMAT_TYPE { diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java b/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java index 9aa7ae1dc..872f8a979 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java @@ -1,5 +1,7 @@ package org.broad.tribble.vcf; +import org.broadinstitute.sting.utils.StingException; + import java.util.*; /** @@ -48,6 +50,9 @@ class VCF4Parser implements VCFLineParser { for (Map.Entry entry : keyValues.entrySet()) { if (start) start = false; else builder.append(","); + + if ( entry.getValue() == null ) throw new StingException("Header problem: unbound value at " + entry + " from " + keyValues); + builder.append(entry.getKey()); builder.append("="); builder.append(entry.getValue().toString().contains(",") || diff --git a/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java index 8a460879d..b788e8156 100755 --- a/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java @@ -1,7 +1,5 @@ package org.broad.tribble.vcf; -import org.broad.tribble.util.ParsingUtils; - import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -14,7 +12,7 @@ import java.util.Map; *

* A class representing a key=value entry for INFO fields in the VCF header */ -public class VCFInfoHeaderLine extends VCFHeaderLine { +public class VCFInfoHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { // the info field types public enum INFO_TYPE { @@ -106,6 +104,7 @@ public class VCFInfoHeaderLine extends VCFHeaderLine { mType == other.mType; } + @Override public String getmName() { return mName; } diff --git a/java/src/org/broad/tribble/vcf/VCFNamedHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFNamedHeaderLine.java new file mode 100755 index 000000000..8e9f1e20c --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFNamedHeaderLine.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broad.tribble.vcf; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jul 2, 2010 + * Time: 2:40:45 PM + * To change this template use File | Settings | File Templates. + */ +public interface VCFNamedHeaderLine { + String getmName(); +} diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 341ea8214..9784058e5 100644 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -175,6 +175,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram { else if(inputFile.getName().toLowerCase().endsWith(".bam")) { unpackedReads.add( inputFile ); } + else if(inputFile.getName().equals("-")) { + unpackedReads.add( new File("/dev/stdin") ); + } else { Utils.scareUser(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " + diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 445884459..4ea246108 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -728,9 +728,9 @@ public class GenomeAnalysisEngine { ValidationExclusion exclusions) { // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original // sharding system; it's required with the new sharding system only for locus walkers. - if(readsDataSource != null && !readsDataSource.hasIndex() && walker instanceof LocusWalker) { + if(readsDataSource != null && !readsDataSource.hasIndex() ) { if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM) || intervals != null) - throw new StingException("The GATK cannot currently process unindexed BAM files"); + throw new StingException("The GATK cannot currently process unindexed BAM files without the -U ALLOW_UNINDEXED_BAM, or with unindexed BAM files with the -L option"); Shard.ShardType shardType; if(walker instanceof LocusWalker) { diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 085515fc1..f4a261aa0 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -165,7 +165,7 @@ public class VariantContextUtils { } public static VariantContext simpleMerge(Collection unsortedVCs) { - return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false); + return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false, false); } @@ -179,7 +179,7 @@ public class VariantContextUtils { * @param mergeOptions * @return */ - public static VariantContext simpleMerge(Collection unsortedVCs, List priorityListOfVCs, EnumSet mergeOptions, boolean annotateOrigin ) { + public static VariantContext simpleMerge(Collection unsortedVCs, List priorityListOfVCs, EnumSet mergeOptions, boolean annotateOrigin, boolean printMessages ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -256,7 +256,7 @@ public class VariantContextUtils { attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); - //if ( remapped ) System.out.printf("Remapped => %s%n", merged); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 43cfcba78..18797cd1b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -434,11 +434,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { double GTQual = VariantContext.NO_NEG_LOG_10PERROR; Set genotypeFilters = null; - String sampleName = sampleNameIterator.next(); - - - // todo -- the parsing of attributes could be made lazy for performance Map gtAttributes = null; + String sampleName = sampleNameIterator.next(); // check to see if the value list is longer than the key list, which is a problem if (nGTKeys < GTValueSplitSize) @@ -457,8 +454,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { genotypeAlleleLocation = i; else if (genotypeKeyArray[i].equals("GQ")) GTQual = parseQual(GTValueArray[i]); - else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here - genotypeFilters.addAll(parseFilters(GTValueArray[i])); + else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here + genotypeFilters = parseFilters(GTValueArray[i]); else gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 958db810e..456b0748f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -25,8 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; @@ -57,6 +56,9 @@ public class CombineVariants extends RodWalker { @Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true) protected String PRIORITY_STRING = null; + @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) + protected boolean printComplexMerges = false; + private VCFWriter vcfWriter = null; private List priority = null; protected EnumSet mergeOptions; @@ -72,16 +74,21 @@ public class CombineVariants extends RodWalker { vcfWriter = new VCFWriter(out, true); priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); + + // todo -- need to merge headers in an intelligent way + validateAnnotateUnionArguments(priority); mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions; - Set samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions); + Map vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null); + Set samples = getSampleList(vcfRods, mergeOptions); - Set metaData = new HashSet(); - metaData.add(new VCFHeaderLine("source", "VCFCombine")); - vcfWriter.writeHeader(new VCFHeader(metaData, samples)); + Set headerLines = smartMergeHeaders(vcfRods.values()); + headerLines.add(new VCFHeaderLine("source", "CombineVariants")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); } - private Set getSampleList(Map headers, EnumSet mergeOptions ) { + // todo -- Eric, where's a better place to put this? + public static Set getSampleList(Map headers, EnumSet mergeOptions ) { Set samples = new TreeSet(); for ( Map.Entry val : headers.entrySet() ) { VCFHeader header = val.getValue(); @@ -93,6 +100,48 @@ public class CombineVariants extends RodWalker { return samples; } + // todo -- Eric, where's a better place to put this? + public static Set smartMergeHeaders(Collection headers) throws IllegalStateException { + HashMap map = new HashMap(); // from KEY.NAME -> line + HashSet lines = new HashSet(); + + for ( VCFHeader source : headers ) { + //System.out.printf("Merging in header %s%n", source); + for ( VCFHeaderLine line : source.getMetaData()) { + String key = line.getKey(); + if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getmName(); + + if ( map.containsKey(key) ) { + VCFHeaderLine other = map.get(key); + if ( line.equals(other) ) + continue; +// System.out.printf("equals duplicate key %s%n", line); + else if ( ! line.getClass().equals(other.getClass()) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + else if ( line instanceof VCFFilterHeaderLine ) { + String lineName = ((VCFFilterHeaderLine) line).getmName(); + String otherName = ((VCFFilterHeaderLine) other).getmName(); + if ( ! lineName.equals(otherName) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + } else { + //String lineName = ((VCFInfoHeaderLine) line).getmName(); + //String otherName = ((VCFFilterHeaderLine) other).getmName(); + + // todo -- aaron, please complete these comparisons when INFO and Format header lines are made into one + //if ( (lineType != null && ! lineType.equals(otherType)) || (lineCount != null && !lineCounts.equals(otherCount))) + // throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + } + } else { + map.put(key, line); + //System.out.printf("Adding header line %s%n", line); + } + } + } + + return new HashSet(map.values()); + } + + private void validateAnnotateUnionArguments(List priority) { Set rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null); if ( priority == null || rodNames.size() != priority.size() ) @@ -108,7 +157,7 @@ public class CombineVariants extends RodWalker { // get all of the vcf rods at this locus Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); - VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true); + VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true, printComplexMerges); if ( mergedVC != null ) // only operate at the start of events //if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed vcfWriter.add(mergedVC, ref.getBases()); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index f96e49752..65ff88cab 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -28,6 +28,7 @@ public class VCFWriter { BufferedWriter mWriter; private boolean writingVCF40Format; + private String PASSES_FILTERS_STRING = null; // our genotype sample fields private static final List mGenotypeRecords = new ArrayList(); @@ -47,8 +48,8 @@ public class VCFWriter { // default values private static final String UNFILTERED = "."; - private static final String PASSES_FILTERS = "0"; - private static final String PASSES_FILTERS_VCF_4_0 = "PASS"; + private static final String PASSES_FILTERS_VCF3 = "0"; + private static final String PASSES_FILTERS_VCF4 = "PASS"; private static final String EMPTY_INFO_FIELD = "."; private static final String EMPTY_ID_FIELD = "."; private static final String EMPTY_ALLELE_FIELD = "."; @@ -66,12 +67,15 @@ public class VCFWriter { public VCFWriter(File location, boolean useVCF4Format) { this.writingVCF40Format = useVCF4Format; + this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3; + FileOutputStream output; try { output = new FileOutputStream(location); } catch (FileNotFoundException e) { throw new RuntimeException("Unable to create VCF file at location: " + location); } + mWriter = new BufferedWriter(new OutputStreamWriter(output)); } @@ -87,6 +91,7 @@ public class VCFWriter { } public VCFWriter(OutputStream output, boolean useVCF4Format) { this.writingVCF40Format = useVCF4Format; + this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3; mWriter = new BufferedWriter(new OutputStreamWriter(output)); } @@ -238,8 +243,8 @@ public class VCFWriter { // TODO- clean up these flags and associated code boolean filtersWereAppliedToContext = true; List allowedGenotypeAttributeKeys = null; - boolean filtersWereAppliedToGenotypes = false; - String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED); + boolean filtersWereAppliedToGenotypes = true; + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_STRING : UNFILTERED); Map alleleMap = new HashMap(); alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup @@ -251,8 +256,6 @@ public class VCFWriter { ArrayList originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST"); - - // search for reference allele and find trailing and padding at the end. if (originalAlleles != null) { Allele originalReferenceAllele = null; @@ -339,7 +342,7 @@ public class VCFWriter { if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) ) vcfGenotypeAttributeKeys.add(key); } - if ( filtersWereAppliedToGenotypes ) + if ( filtersWereAppliedToGenotypes ) // todo -- should be calculated vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY); } String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); @@ -382,7 +385,7 @@ public class VCFWriter { val = pileup.size(); } else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) { // VCF 4.0 key for no filters is "." - val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : UNFILTERED; + val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS_STRING; }