diff --git a/java/src/org/broad/tribble/vcf/NameAwareCodec.java b/java/src/org/broad/tribble/vcf/NameAwareCodec.java new file mode 100755 index 000000000..348866421 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/NameAwareCodec.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broad.tribble.vcf; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 29, 2010 + * Time: 3:48:47 PM + * To change this template use File | Settings | File Templates. 
+ */ +public interface NameAwareCodec { + public String getName(); + public void setName(String name); +} diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java index 9dd2f0202..eaf8f8d69 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java @@ -12,6 +12,11 @@ public class MutableGenotype extends Genotype { super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased()); } + public MutableGenotype(String sampleName, Genotype parent) { + super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased()); + } + + public MutableGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased); } diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index f5ead41c2..52e8bc086 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -27,6 +27,8 @@ import java.util.*; import org.apache.commons.jexl2.*; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation; import org.broad.tribble.vcf.VCFRecord; @@ -158,34 
+160,59 @@ public class VariantContextUtils { return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); } + public enum MergeType { + UNION_VARIANTS, INTERSECT_VARIANTS, UNIQUIFY_GENOTYPES, PRIORITIZE_GENOTYPES, UNSORTED_GENOTYPES + } - public static VariantContext simpleMerge(Set VCs) { - if ( VCs == null || VCs.size() == 0 ) + public static VariantContext simpleMerge(Collection unsortedVCs) { + return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false); + } + + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param unsortedVCs + * @param priorityListOfVCs + * @param mergeOptions + * @return + */ + public static VariantContext simpleMerge(Collection unsortedVCs, List priorityListOfVCs, EnumSet mergeOptions, boolean annotateOrigin ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; - Iterator iter = VCs.iterator(); + if ( annotateOrigin && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); + + List VCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, mergeOptions); // establish the baseline info from the first VC - VariantContext first = iter.next(); + VariantContext first = VCs.get(0); String name = first.getName(); GenomeLoc loc = first.getLocation(); - Set alleles = new HashSet(first.getAlleles()); - Map genotypes = new HashMap(first.getGenotypes()); - double negLog10PError = first.isVariant() ? 
first.getNegLog10PError() : -1; - Set filters = new HashSet(first.getFilters()); + + Set alleles = new HashSet(); + Map genotypes = new HashMap(); + double negLog10PError = -1; + Set filters = new HashSet(); Map attributes = new HashMap(); int depth = 0; - if ( first.hasAttribute(VCFRecord.DEPTH_KEY) ) - depth = Integer.valueOf(first.getAttribute(VCFRecord.DEPTH_KEY).toString()); + + // filtering values + int nFiltered = 0; // cycle through and add info from the other VCs, making sure the loc/reference matches - while ( iter.hasNext() ) { - VariantContext vc = iter.next(); - if ( !loc.equals(vc.getLocation()) || !first.getReference().equals(vc.getReference()) ) - return null; + for ( VariantContext vc : VCs ) { + if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) ) + throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + nFiltered += vc.isFiltered() ? 1 : 0; alleles.addAll(vc.getAlleles()); - genotypes.putAll(vc.getGenotypes()); + + mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); negLog10PError = Math.max(negLog10PError, vc.isVariant() ? 
vc.getNegLog10PError() : -1); @@ -194,8 +221,74 @@ public class VariantContextUtils { depth += Integer.valueOf(vc.getAttribute(VCFRecord.DEPTH_KEY).toString()); } + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( mergeOptions.contains(MergeType.UNION_VARIANTS) && nFiltered != VCs.size() ) + filters.clear(); + + // we care about where the call came from + if ( annotateOrigin ) { + String setValue = ""; + if ( nFiltered == 0 && VCs.size() == priorityListOfVCs.size() ) // nothing was filtered + setValue = "Intersection"; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = "FilteredInAll"; + else { // we are filtered in some subset + List s = new ArrayList(); + for ( VariantContext vc : VCs ) + s.add( vc.isFiltered() ? "filterIn" + vc.getName() : vc.getName() ); + setValue = Utils.join("-", s); + } + + attributes.put("set", setValue); + } + if ( depth > 0 ) attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); } + + static class CompareByPriority implements Comparator { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getName()); + if ( i == -1 ) throw new StingException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getName()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return new Integer(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, EnumSet mergeOptions ) { + if ( mergeOptions.contains(MergeType.PRIORITIZE_GENOTYPES) && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); + + if ( 
priorityListOfVCs == null || mergeOptions.contains(MergeType.UNSORTED_GENOTYPES) ) + return new ArrayList(unsortedVCs); + else { + ArrayList sorted = new ArrayList(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) { + for ( Genotype g : oneVC.getGenotypes().values() ) { + String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples); + if ( ! mergedGenotypes.containsKey(name) ) { + // only add if the name is new + Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g); + mergedGenotypes.put(name, newG); + } + } + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { + return uniqify ? sampleName + "." + trackName : sampleName; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index e8c834653..e079bcac7 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -199,6 +199,16 @@ public class RefMetaDataTracker { return getAllVariantContexts(ref, null, null, false, false); } + /** + * Returns all of the variant contexts that start at the current location + * @param ref + * @param curLocation + * @return + */ + public Collection getAllVariantContexts(ReferenceContext ref, GenomeLoc curLocation) { + return getAllVariantContexts(ref, null, curLocation, true, false); + } + /** * Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only * VariantContexts in the allow set of types will be returned. 
If requireStartsHere is true, then curLocation diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 97e874542..b77fa6249 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -23,7 +23,7 @@ import java.util.*; * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. */ -public class VCF4Codec implements FeatureCodec { +public class VCF4Codec implements FeatureCodec, NameAwareCodec { // a variant context flag for original allele strings public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST"; @@ -62,7 +62,7 @@ public class VCF4Codec implements FeatureCodec { private final boolean validateFromHeader = false; // we store a name to give to each of the variant contexts we emit - private String name = "Unkown"; + private String name = "Unknown"; private int lineNo = 0; @@ -224,12 +224,12 @@ public class VCF4Codec implements FeatureCodec { String[] split = str.split(","); for (String substring : split) { VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key); - objects.add(type.convert(substring)); + objects.add(type != null ? type.convert(substring) : substring); } value = objects; } else { VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key); - value = type.convert(str); + value = type != null ? 
type.convert(str) : str; } //System.out.printf("%s %s%n", key, value); } else { @@ -366,7 +366,7 @@ public class VCF4Codec implements FeatureCodec { * @return a variant context object */ private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) { - try { +// try { // increment the line count lineNo++; @@ -403,9 +403,10 @@ public class VCF4Codec implements FeatureCodec { } return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); - } catch ( Exception e ) { - throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e); - } +// } catch ( Exception e ) { +// // todo -- we need a local exception so that we can remember the location of the throw but also see the line +// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e); +// } } /** @@ -545,7 +546,7 @@ public class VCF4Codec implements FeatureCodec { if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class); return this.header; } - + /** * get the name of this codec * @return our set name diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java index c4c59933f..f0fcb5ebd 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java @@ -120,4 +120,8 @@ public abstract class RMDTrack { public HeaderType getHeader(Class clazz) throws ClassCastException { return null; } + + public Object getHeader() { + return null; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java 
b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java index d5cb82894..f9758d263 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java @@ -32,6 +32,7 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.linear.LinearIndex; import org.broad.tribble.index.linear.LinearIndexCreator; import org.broad.tribble.readers.BasicFeatureReader; +import org.broad.tribble.vcf.NameAwareCodec; import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; @@ -95,9 +96,13 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen @Override public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { // return a feature reader track - Pair pair = createFeatureReader(targetClass, inputFile); + Pair pair = createFeatureReader(targetClass, name, inputFile); if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile); - return new TribbleTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, pair.first, pair.second); + return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second); + } + + public Pair createFeatureReader(Class targetClass, File inputFile) { + return createFeatureReader(targetClass, "anonymous", inputFile); } /** @@ -106,12 +111,12 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen * @param inputFile the input file to create the track from (of the codec type) * @return the FeatureReader instance */ - public Pair createFeatureReader(Class targetClass, File inputFile) { + public Pair 
createFeatureReader(Class targetClass, String name, File inputFile) { Pair pair = null; if (inputFile.getAbsolutePath().endsWith(".gz")) - pair = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile); + pair = createBasicFeatureReaderNoAssumedIndex(targetClass, name, inputFile); else - pair = getLinearFeatureReader(targetClass, inputFile); + pair = getLinearFeatureReader(targetClass, name, inputFile); return pair; } @@ -124,27 +129,34 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen * @param inputFile the file to load * @return a feature reader implementation */ - private Pair createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) { + private Pair createBasicFeatureReaderNoAssumedIndex(Class targetClass, String name, File inputFile) { // we might not know the index type, try loading with the default reader constructor logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); try { - return new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)),null); + return new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null); } catch (IOException e) { throw new StingException("Unable to create feature reader from file " + inputFile); } } + private FeatureCodec createCodec(Class targetClass, String name) { + FeatureCodec codex = this.createByType(targetClass); + if ( codex instanceof NameAwareCodec ) + ((NameAwareCodec)codex).setName(name); + return codex; + } + /** * create a linear feature reader, where we create the index ahead of time * @param targetClass the target class * @param inputFile the tribble file to parse * @return the input file as a FeatureReader */ - private Pair getLinearFeatureReader(Class targetClass, File inputFile) { + private Pair getLinearFeatureReader(Class targetClass, String name, File inputFile) { Pair reader; try { - Index index = loadIndex(inputFile, this.createByType(targetClass), true); - 
reader = new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)),index.getSequenceDictionary()); + Index index = loadIndex(inputFile, createCodec(targetClass, name), true); + reader = new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary()); } catch (FileNotFoundException e) { throw new StingException("Unable to create reader with file " + inputFile, e); } catch (IOException e) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/vcf/VCFCombine.java b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/VCFCombine.java index 3d32ef27d..8ef032f35 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/vcf/VCFCombine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/VCFCombine.java @@ -27,21 +27,18 @@ package org.broadinstitute.sting.gatk.walkers.vcf; import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.genotype.vcf.*; -import java.io.File; import java.util.*; /** @@ -51,191 +48,86 @@ import java.util.*; * priority list (if 
provided), emits a single record instance at every position represented in the rods. */ @Requires(value={}) -public class VCFCombine extends RodWalker { +public class VCFCombine extends RodWalker { // the types of combinations we currently allow - public enum COMBINATION_TYPE { - UNION, MERGE - } + public enum ComboType { UNION, MERGE } + @Argument(fullName="combination_type", shortName="type", doc="combination type; MERGE are supported", required=true) + protected ComboType COMBO_TYPE; - @Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false) - protected File OUTPUT_FILE = null; - - @Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true) - protected COMBINATION_TYPE COMBO_TYPE; - - @Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; a complete priority list MUST be provided", required=false) + @Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true) protected String PRIORITY_STRING = null; - @Argument(fullName="annotateUnion", shortName="A", doc="For the UNION combination type: if provided, the output union VCF will contain venn information about where each call came from", required=false) - protected boolean annotateUnion = false; - private VCFWriter vcfWriter = null; - private String[] priority = null; - - // a map of rod name to uniquified sample name - private HashMap, String> rodNamesToSampleNames; + private List priority = null; + protected EnumSet mergeOptions; + protected final static EnumSet mergeTypeOptions = 
EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES); + protected final static EnumSet unionTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.PRIORITIZE_GENOTYPES); public void initialize() { + //Set hInfo = new HashSet(); + //hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + + vcfWriter = new VCFWriter(out); + priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); + + validateAnnotateUnionArguments(priority); + mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions; + Set samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions); + Set metaData = new HashSet(); - metaData.add(new VCFHeaderLine("source", "VCF")); - - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - - if ( OUTPUT_FILE != null ) - vcfWriter = new VCFWriter(OUTPUT_FILE); - else - vcfWriter = new VCFWriter(out); - - if ( PRIORITY_STRING != null ) - priority = PRIORITY_STRING.split(","); - - Set samples; - switch (COMBO_TYPE ) { - case MERGE: - samples = new TreeSet(); - rodNamesToSampleNames = new HashMap, String>(); - // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) - SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); - break; - case UNION: - if ( annotateUnion ) { - validateAnnotateUnionArguments(priority); - } - - samples = SampleUtils.getUniqueSamplesFromRods(getToolkit()); - break; - default: - throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE); - } - - vcfWriter.writeHeader(new VCFHeader(hInfo, samples)); + metaData.add(new VCFHeaderLine("source", "VCFCombine")); + vcfWriter.writeHeader(new VCFHeader(metaData, samples)); } - private void validateAnnotateUnionArguments(String[] priority) { - Set rods = VCFUtils.getRodVCFs(getToolkit()); - if ( 
priority == null || rods.size() != priority.length ) { - throw new StingException("A complete priority list must be provided when annotateUnion is provided"); - } - if ( priority.length != 2 ) { - throw new StingException("When annotateUnion is provided only 2 VCF files can be merged"); - } - - for ( String p : priority ) { - boolean good = false; - for ( RMDTrack data : rods ) { - if ( p.equals(data.getName()) ) - good = true; + private Set getSampleList(Map headers, EnumSet mergeOptions ) { + Set samples = new HashSet(); + for ( Map.Entry val : headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOptions.contains(VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES))); } - if ( ! good ) throw new StingException("Priority item not provided as a ROD! " + p); } + + return samples; } - public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + private void validateAnnotateUnionArguments(List priority) { + Set rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null); + if ( priority == null || rodNames.size() != priority.size() ) + throw new StingException("A complete priority list must be provided when annotateUnion is provided"); + + if ( ! 
rodNames.containsAll(priority) ) + throw new StingException("Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls - return null; + return 0; // get all of the vcf rods at this locus - Map vcfRods = new LinkedHashMap(); - Iterator rods = tracker.getAllRods().iterator(); - while (rods.hasNext()) { - GATKFeature feat = rods.next(); - Object rod = feat.getUnderlyingObject(); - if ( rod instanceof VCFRecord ) - vcfRods.put((VCFRecord)rod,feat.getName()); - } + Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); + VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true); + if ( mergedVC != null ) // only operate at the start of events + if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed + vcfWriter.add(mergedVC, ref.getBases()); + else + logger.info(String.format("Ignoring complex event: " + mergedVC)); - if ( vcfRods.size() == 0 ) - return null; - - VCFRecord record = null; - switch (COMBO_TYPE ) { - case MERGE: - record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames); - break; - case UNION: - record = vcfUnion(vcfRods); - break; - default: - break; - } - - return record; + return vcs.isEmpty() ? 0 : 1; } - public VCFWriter reduceInit() { - return vcfWriter; + public Integer reduceInit() { + return 0; } - private VCFRecord vcfUnion(Map rods) { - if ( priority == null ) - return rods.keySet().iterator().next(); - - if ( annotateUnion ) { - Map rodMap = new HashMap(); - for ( VCFRecord vcf : rods.keySet() ) { - rodMap.put(rods.get(vcf),vcf); - } - - String priority1 = priority[0]; - String priority2 = priority[1]; - VCFRecord vcf1 = rodMap.containsKey(priority1) ? rodMap.get(priority1) : null; - VCFRecord vcf2 = rodMap.containsKey(priority2) ? 
rodMap.get(priority2) : null; - - // for simplicity, we are setting set and call for vcf1 - String set = priority1; - VCFRecord call = vcf1; - - if ( vcf1 == null ) { - if ( vcf2 == null ) - //return null; - throw new StingException("BUG: VCF1 and VCF2 are both null!"); - else { - set = priority2; - call = vcf2; - } - } else if ( vcf1.isFiltered() ) { - if ( vcf2 != null ) { - if ( vcf2.isFiltered() ) { - set = "filteredInBoth"; - } else { - set = priority2 + "-filteredInOther"; - call = vcf2; - } - } - } else { // good call - if ( vcf2 != null ) { - if ( vcf2.isFiltered() ) - set = priority1 + "-filteredInOther"; - else - set = "Intersection"; - } - } - - call.addInfoField("set", set); - return call; - } else { - for ( String rodname : priority ) { - for ( VCFRecord rod : rods.keySet() ) { - if ( rods.get(rod).equals(rodname) ) - return rod; - } - } - } - - return null; + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; } - public VCFWriter reduce(VCFRecord record, VCFWriter writer) { - if ( record != null ) - writer.addRecord(record); - return writer; - } - - public void onTraversalDone(VCFWriter writer) { - if ( writer != null ) - writer.close(); + public void onTraversalDone(Integer sum) { + if ( vcfWriter != null ) + vcfWriter.close(); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 31d106c53..7cf41cac8 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import 
org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -87,6 +86,19 @@ public class SampleUtils { public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set rodNames) { Set samples = new TreeSet(); + for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() ) + samples.addAll(header.getGenotypeSamples()); + + return samples; + } + + public static Set getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set rodNames) { + return getRodsWithVCFHeader(toolkit, rodNames).keySet(); + } + + public static Map getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set rodNames) { + Map data = new HashMap(); + // iterate to get all of the sample names List dataSources = toolkit.getRodDataSources(); for ( ReferenceOrderedDataSource source : dataSources ) { @@ -95,13 +107,23 @@ public class SampleUtils { continue; RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getRecordType().equals(VCFRecord.class) ) - samples.addAll(rod.getHeader(VCFHeader.class).getGenotypeSamples()); + if ( containsVCFHeader(rod) ) + data.put(rod.getName(), rod.getHeader(VCFHeader.class)); } - return samples; + return data; } + // todo -- remove when we can actually just get the header itself from tribble + private static boolean containsVCFHeader(RMDTrack rod) { + try { + return rod.getHeader(VCFHeader.class) != null; + } catch ( ClassCastException e ) { + return false; + } + } + + /** * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap * (e.g. sampleX.1, sampleX.2, ...) 
@@ -118,14 +140,11 @@ public class SampleUtils { HashMap sampleOverlapMap = new HashMap(); // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getRecordType().equals(VCFRecord.class) ) { - Set vcfSamples = rod.getHeader(VCFHeader.class).getGenotypeSamples(); - for ( String sample : vcfSamples ) - addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); - } + + for ( Map.Entry pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) { + Set vcfSamples = pair.getValue().getGenotypeSamples(); + for ( String sample : vcfSamples ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); } }