Preliminary commit of new VCFCombine, soon to be called CombineVariants (next commit), that supports merging any number of VCF files via a general VC merge routine that supports prioritization and merging of samples! It's now possible to merge the pilot1/2/3 call sets into a single (monster) VCF, taking genotypes from pilot2, then pilot3, then pilot1 as needed.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3690 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-06-30 20:13:03 +00:00
parent a46e22ed13
commit b8d6a95e7a
9 changed files with 284 additions and 211 deletions

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broad.tribble.vcf;
/**
 * A codec that carries the name of the track it decodes, so that records it
 * emits can be attributed back to their source ROD (e.g. VCF4Codec tags each
 * VariantContext with the codec's name).
 *
 * User: depristo
 * Date: Jun 29, 2010
 */
public interface NameAwareCodec {
    /**
     * @return the name currently associated with this codec
     */
    String getName();

    /**
     * Associates a name with this codec.
     *
     * @param name the new name for this codec
     */
    void setName(String name);
}

View File

@ -12,6 +12,11 @@ public class MutableGenotype extends Genotype {
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
}
/**
 * Copy constructor that renames the sample: all genotype data (alleles,
 * error estimate, filters, attributes, phasing) is taken from parent, but
 * the sample name is replaced with sampleName.
 */
public MutableGenotype(String sampleName, Genotype parent) {
    super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
}
/**
 * Full constructor: builds a mutable genotype directly from its component
 * fields, delegating straight to the Genotype superclass.
 */
public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
    super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
}

View File

@ -27,6 +27,8 @@ import java.util.*;
import org.apache.commons.jexl2.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
import org.broad.tribble.vcf.VCFRecord;
@ -158,34 +160,59 @@ public class VariantContextUtils {
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
}
/**
 * Options controlling simpleMerge: variant-level behavior
 * (UNION_VARIANTS / INTERSECT_VARIANTS) and genotype/sample handling
 * (UNIQUIFY_GENOTYPES to rename overlapping samples per track,
 * PRIORITIZE_GENOTYPES to take genotypes in priority-list order,
 * UNSORTED_GENOTYPES to skip priority sorting).
 */
public enum MergeType {
    UNION_VARIANTS, INTERSECT_VARIANTS, UNIQUIFY_GENOTYPES, PRIORITIZE_GENOTYPES, UNSORTED_GENOTYPES
}
public static VariantContext simpleMerge(Set<VariantContext> VCs) {
if ( VCs == null || VCs.size() == 0 )
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
}
/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name
*
* @param unsortedVCs
* @param priorityListOfVCs
* @param mergeOptions
* @return
*/
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null;
Iterator<VariantContext> iter = VCs.iterator();
if ( annotateOrigin && priorityListOfVCs == null )
throw new IllegalArgumentException("Cannot merge calls and annotate their origins with a complete priority list of VariantContexts");
List<VariantContext> VCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, mergeOptions);
// establish the baseline info from the first VC
VariantContext first = iter.next();
VariantContext first = VCs.get(0);
String name = first.getName();
GenomeLoc loc = first.getLocation();
Set<Allele> alleles = new HashSet<Allele>(first.getAlleles());
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(first.getGenotypes());
double negLog10PError = first.isVariant() ? first.getNegLog10PError() : -1;
Set<String> filters = new HashSet<String>(first.getFilters());
Set<Allele> alleles = new HashSet<Allele>();
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
double negLog10PError = -1;
Set<String> filters = new HashSet<String>();
Map<String, String> attributes = new HashMap<String, String>();
int depth = 0;
if ( first.hasAttribute(VCFRecord.DEPTH_KEY) )
depth = Integer.valueOf(first.getAttribute(VCFRecord.DEPTH_KEY).toString());
// filtering values
int nFiltered = 0;
// cycle through and add info from the other VCs, making sure the loc/reference matches
while ( iter.hasNext() ) {
VariantContext vc = iter.next();
if ( !loc.equals(vc.getLocation()) || !first.getReference().equals(vc.getReference()) )
return null;
for ( VariantContext vc : VCs ) {
if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) )
throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
nFiltered += vc.isFiltered() ? 1 : 0;
alleles.addAll(vc.getAlleles());
genotypes.putAll(vc.getGenotypes());
mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1);
@ -194,8 +221,74 @@ public class VariantContextUtils {
depth += Integer.valueOf(vc.getAttribute(VCFRecord.DEPTH_KEY).toString());
}
// if at least one record was unfiltered and we want a union, clear all of the filters
if ( mergeOptions.contains(MergeType.UNION_VARIANTS) && nFiltered != VCs.size() )
filters.clear();
// we care about where the call came from
if ( annotateOrigin ) {
String setValue = "";
if ( nFiltered == 0 && VCs.size() == priorityListOfVCs.size() ) // every input present and nothing was filtered
setValue = "Intersection";
else if ( nFiltered == VCs.size() ) // everything was filtered out
setValue = "FilteredInAll";
else { // we are filtered in some subset
List<String> s = new ArrayList<String>();
for ( VariantContext vc : VCs )
s.add( vc.isFiltered() ? "filterIn" + vc.getName() : vc.getName() );
setValue = Utils.join("-", s);
}
attributes.put("set", setValue);
}
if ( depth > 0 )
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
}
/**
 * Orders VariantContexts by the position of their name in a user-supplied
 * priority list: names appearing earlier in the list sort first.
 */
static class CompareByPriority implements Comparator<VariantContext> {
    List<String> priorityListOfVCs;

    public CompareByPriority(List<String> priorityListOfVCs) {
        this.priorityListOfVCs = priorityListOfVCs;
    }

    /**
     * @return the index of vc's name in the priority list
     * @throws StingException if the name is absent from the list
     */
    private int getIndex(VariantContext vc) {
        int i = priorityListOfVCs.indexOf(vc.getName());
        if ( i == -1 ) throw new StingException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getName());
        return i;
    }

    public int compare(VariantContext vc1, VariantContext vc2) {
        // compare the indices directly; the original boxed both sides via the
        // (now discouraged) new Integer(...) constructor just to call compareTo
        int i1 = getIndex(vc1);
        int i2 = getIndex(vc2);
        return i1 < i2 ? -1 : (i1 == i2 ? 0 : 1);
    }
}
/**
 * Returns the input VariantContexts as a List, sorted by the priority list
 * when one applies.  No sorting is done when the priority list is null or
 * UNSORTED_GENOTYPES was requested.
 *
 * @throws IllegalArgumentException if PRIORITIZE_GENOTYPES is requested
 *         without a priority list
 */
public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions ) {
    if ( mergeOptions.contains(MergeType.PRIORITIZE_GENOTYPES) && priorityListOfVCs == null )
        throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");

    ArrayList<VariantContext> result = new ArrayList<VariantContext>(unsortedVCs);
    if ( priorityListOfVCs != null && ! mergeOptions.contains(MergeType.UNSORTED_GENOTYPES) )
        Collections.sort(result, new CompareByPriority(priorityListOfVCs));
    return result;
}
/**
 * Folds the genotypes of oneVC into mergedGenotypes.  Because higher-priority
 * VCs are processed first, only the first genotype seen for a given (possibly
 * uniquified) sample name is kept.
 *
 * @param mergedGenotypes accumulator map of merged sample name -> genotype
 * @param oneVC           the context whose genotypes are being merged in
 * @param uniqifySamples  if true, sample names are uniquified per track
 */
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) {
    for ( Genotype g : oneVC.getGenotypes().values() ) {
        String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples);
        if ( ! mergedGenotypes.containsKey(name) ) {
            // only add if the name is new.  Bug fix: the original ternary was
            // inverted -- when uniquifying, the genotype itself must be renamed
            // to match its uniquified map key (otherwise the emitted genotype
            // still carries the original, colliding sample name); when not
            // uniquifying, name == g.getSampleName() and g can be used as-is.
            Genotype newG = uniqifySamples ? new MutableGenotype(name, g) : g;
            mergedGenotypes.put(name, newG);
        }
    }
}
/**
 * Computes the output sample name for a sample coming from a given track.
 * When uniqify is true the track name is appended ("sample.track") so that
 * identical sample names from different tracks do not collide; otherwise
 * the sample name is returned unchanged.
 *
 * @param trackName  the name of the source track/ROD
 * @param sampleName the sample name within that track
 * @param uniqify    whether to make the name unique per track
 * @return the (possibly uniquified) sample name
 */
public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) {
    if ( ! uniqify )
        return sampleName;
    return sampleName + "." + trackName;
}
}

View File

@ -199,6 +199,16 @@ public class RefMetaDataTracker {
return getAllVariantContexts(ref, null, null, false, false);
}
/**
 * Returns all of the variant contexts that start at the current location.
 *
 * Convenience overload of the general getAllVariantContexts: no type
 * restriction (allowedTypes == null) and requireStartsHere == true.
 * NOTE(review): the final boolean argument's meaning is not visible here --
 * confirm against the full-signature overload before relying on it.
 *
 * @param ref         the reference context at the current locus
 * @param curLocation only contexts starting at this location are returned
 * @return the collection of VariantContexts starting at curLocation
 */
public Collection<VariantContext> getAllVariantContexts(ReferenceContext ref, GenomeLoc curLocation) {
    return getAllVariantContexts(ref, null, curLocation, true, false);
}
/**
* Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only
* VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation

View File

@ -23,7 +23,7 @@ import java.util.*;
* a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as
* quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
*/
public class VCF4Codec implements FeatureCodec {
public class VCF4Codec implements FeatureCodec, NameAwareCodec {
// a variant context flag for original allele strings
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
@ -62,7 +62,7 @@ public class VCF4Codec implements FeatureCodec {
private final boolean validateFromHeader = false;
// we store a name to give to each of the variant contexts we emit
private String name = "Unkown";
private String name = "Unknown";
private int lineNo = 0;
@ -224,12 +224,12 @@ public class VCF4Codec implements FeatureCodec {
String[] split = str.split(",");
for (String substring : split) {
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
objects.add(type.convert(substring));
objects.add(type != null ? type.convert(substring) : substring);
}
value = objects;
} else {
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
value = type.convert(str);
value = type != null ? type.convert(str) : str;
}
//System.out.printf("%s %s%n", key, value);
} else {
@ -366,7 +366,7 @@ public class VCF4Codec implements FeatureCodec {
* @return a variant context object
*/
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
try {
// try {
// increment the line count
lineNo++;
@ -403,9 +403,10 @@ public class VCF4Codec implements FeatureCodec {
}
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
} catch ( Exception e ) {
throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
}
// } catch ( Exception e ) {
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
// }
}
/**
@ -545,7 +546,7 @@ public class VCF4Codec implements FeatureCodec {
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
return this.header;
}
/**
* get the name of this codec
* @return our set name

View File

@ -120,4 +120,8 @@ public abstract class RMDTrack {
/**
 * Gets this track's header cast to the requested type.  This base
 * implementation returns null (no header available); track types that carry
 * a header should override it.
 *
 * @param clazz the expected header class
 * @return the header, or null if this track has none
 * @throws ClassCastException if the underlying header is not of type clazz
 */
public <HeaderType> HeaderType getHeader(Class<HeaderType> clazz) throws ClassCastException {
    return null;
}
/**
 * Gets this track's header as an untyped Object.  This base implementation
 * returns null (no header available); subclasses should override.
 *
 * @return the header object, or null if none exists
 */
public Object getHeader() {
    return null;
}
}

View File

@ -32,6 +32,7 @@ import org.broad.tribble.index.Index;
import org.broad.tribble.index.linear.LinearIndex;
import org.broad.tribble.index.linear.LinearIndexCreator;
import org.broad.tribble.readers.BasicFeatureReader;
import org.broad.tribble.vcf.NameAwareCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
@ -95,9 +96,13 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
@Override
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
// return a feature reader track
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, inputFile);
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile);
return new TribbleTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, pair.first, pair.second);
return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second);
}
/**
 * Backwards-compatible overload: creates a feature reader with the
 * placeholder track name "anonymous", for callers that have no meaningful
 * name to attach to the codec.
 */
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
    return createFeatureReader(targetClass, "anonymous", inputFile);
}
/**
@ -106,12 +111,12 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
* @param inputFile the input file to create the track from (of the codec type)
* @return the FeatureReader instance
*/
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, String name, File inputFile) {
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = null;
if (inputFile.getAbsolutePath().endsWith(".gz"))
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile);
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, name, inputFile);
else
pair = getLinearFeatureReader(targetClass, inputFile);
pair = getLinearFeatureReader(targetClass, name, inputFile);
return pair;
}
@ -124,27 +129,34 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
* @param inputFile the file to load
* @return a feature reader implementation
*/
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) {
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, String name, File inputFile) {
// we might not know the index type, try loading with the default reader constructor
logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file");
try {
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)),null);
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null);
} catch (IOException e) {
throw new StingException("Unable to create feature reader from file " + inputFile);
}
}
/**
 * Instantiates the codec registered for targetClass and, when that codec is
 * name-aware, stamps it with the supplied track name so the records it emits
 * can be attributed to their source ROD.
 */
private FeatureCodec createCodec(Class targetClass, String name) {
    FeatureCodec codec = this.createByType(targetClass);
    if ( codec instanceof NameAwareCodec ) {
        NameAwareCodec nameAware = (NameAwareCodec)codec;
        nameAware.setName(name);
    }
    return codec;
}
/**
* create a linear feature reader, where we create the index ahead of time
* @param targetClass the target class
* @param inputFile the tribble file to parse
* @return the input file as a FeatureReader
*/
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, File inputFile) {
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
Pair<BasicFeatureReader, SAMSequenceDictionary> reader;
try {
Index index = loadIndex(inputFile, this.createByType(targetClass), true);
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)),index.getSequenceDictionary());
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary());
} catch (FileNotFoundException e) {
throw new StingException("Unable to create reader with file " + inputFile, e);
} catch (IOException e) {

View File

@ -27,21 +27,18 @@ package org.broadinstitute.sting.gatk.walkers.vcf;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.util.*;
/**
@ -51,191 +48,86 @@ import java.util.*;
* priority list (if provided), emits a single record instance at every position represented in the rods.
*/
@Requires(value={})
public class VCFCombine extends RodWalker<VCFRecord, VCFWriter> {
public class VCFCombine extends RodWalker<Integer, Integer> {
// the types of combinations we currently allow
public enum COMBINATION_TYPE {
UNION, MERGE
}
public enum ComboType { UNION, MERGE }
@Argument(fullName="combination_type", shortName="type", doc="combination type; MERGE are supported", required=true)
protected ComboType COMBO_TYPE;
@Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false)
protected File OUTPUT_FILE = null;
@Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true)
protected COMBINATION_TYPE COMBO_TYPE;
@Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; a complete priority list MUST be provided", required=false)
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
protected String PRIORITY_STRING = null;
@Argument(fullName="annotateUnion", shortName="A", doc="For the UNION combination type: if provided, the output union VCF will contain venn information about where each call came from", required=false)
protected boolean annotateUnion = false;
private VCFWriter vcfWriter = null;
private String[] priority = null;
// a map of rod name to uniquified sample name
private HashMap<Pair<String, String>, String> rodNamesToSampleNames;
private List<String> priority = null;
protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
protected final static EnumSet<VariantContextUtils.MergeType> mergeTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES);
protected final static EnumSet<VariantContextUtils.MergeType> unionTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.PRIORITIZE_GENOTYPES);
public void initialize() {
//Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
//hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
vcfWriter = new VCFWriter(out);
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
validateAnnotateUnionArguments(priority);
mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine("source", "VCF"));
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
if ( OUTPUT_FILE != null )
vcfWriter = new VCFWriter(OUTPUT_FILE);
else
vcfWriter = new VCFWriter(out);
if ( PRIORITY_STRING != null )
priority = PRIORITY_STRING.split(",");
Set<String> samples;
switch (COMBO_TYPE ) {
case MERGE:
samples = new TreeSet<String>();
rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
break;
case UNION:
if ( annotateUnion ) {
validateAnnotateUnionArguments(priority);
}
samples = SampleUtils.getUniqueSamplesFromRods(getToolkit());
break;
default:
throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE);
}
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
metaData.add(new VCFHeaderLine("source", "VCFCombine"));
vcfWriter.writeHeader(new VCFHeader(metaData, samples));
}
private void validateAnnotateUnionArguments(String[] priority) {
Set<RMDTrack> rods = VCFUtils.getRodVCFs(getToolkit());
if ( priority == null || rods.size() != priority.length ) {
throw new StingException("A complete priority list must be provided when annotateUnion is provided");
}
if ( priority.length != 2 ) {
throw new StingException("When annotateUnion is provided only 2 VCF files can be merged");
}
for ( String p : priority ) {
boolean good = false;
for ( RMDTrack data : rods ) {
if ( p.equals(data.getName()) )
good = true;
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
Set<String> samples = new HashSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) {
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOptions.contains(VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES)));
}
if ( ! good ) throw new StingException("Priority item not provided as a ROD! " + p);
}
return samples;
}
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
/**
 * Validates the -priority argument for union annotation: the list must be
 * non-null, contain exactly one entry per input VCF rod, and every entry
 * must name one of those rods.
 *
 * @param priority the parsed priority list of rod names
 * @throws StingException if the list is missing, incomplete, or names a rod
 *         that was not supplied as input
 */
private void validateAnnotateUnionArguments(List<String> priority) {
    Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
    if ( priority == null || rodNames.size() != priority.size() )
        throw new StingException("A complete priority list must be provided when annotateUnion is provided");

    // bug fix: the original tested rodNames.containsAll(rodNames), which is
    // trivially true; the priority entries must be checked against the rods
    if ( ! rodNames.containsAll(priority) )
        throw new StingException("Not all priority elements provided as input RODs: " + PRIORITY_STRING);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return null;
return 0;
// get all of the vcf rods at this locus
Map<VCFRecord, String> vcfRods = new LinkedHashMap<VCFRecord,String>();
Iterator<GATKFeature> rods = tracker.getAllRods().iterator();
while (rods.hasNext()) {
GATKFeature feat = rods.next();
Object rod = feat.getUnderlyingObject();
if ( rod instanceof VCFRecord )
vcfRods.put((VCFRecord)rod,feat.getName());
}
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
if ( mergedVC != null ) // only operate at the start of events
if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
vcfWriter.add(mergedVC, ref.getBases());
else
logger.info(String.format("Ignoring complex event: " + mergedVC));
if ( vcfRods.size() == 0 )
return null;
VCFRecord record = null;
switch (COMBO_TYPE ) {
case MERGE:
record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
break;
case UNION:
record = vcfUnion(vcfRods);
break;
default:
break;
}
return record;
return vcs.isEmpty() ? 0 : 1;
}
public VCFWriter reduceInit() {
return vcfWriter;
/**
 * Initial value for the reduce accumulator: zero loci counted so far.
 */
public Integer reduceInit() {
    return 0;
}
private VCFRecord vcfUnion(Map<VCFRecord, String> rods) {
if ( priority == null )
return rods.keySet().iterator().next();
if ( annotateUnion ) {
Map<String, VCFRecord> rodMap = new HashMap<String, VCFRecord>();
for ( VCFRecord vcf : rods.keySet() ) {
rodMap.put(rods.get(vcf),vcf);
}
String priority1 = priority[0];
String priority2 = priority[1];
VCFRecord vcf1 = rodMap.containsKey(priority1) ? rodMap.get(priority1) : null;
VCFRecord vcf2 = rodMap.containsKey(priority2) ? rodMap.get(priority2) : null;
// for simplicity, we are setting set and call for vcf1
String set = priority1;
VCFRecord call = vcf1;
if ( vcf1 == null ) {
if ( vcf2 == null )
//return null;
throw new StingException("BUG: VCF1 and VCF2 are both null!");
else {
set = priority2;
call = vcf2;
}
} else if ( vcf1.isFiltered() ) {
if ( vcf2 != null ) {
if ( vcf2.isFiltered() ) {
set = "filteredInBoth";
} else {
set = priority2 + "-filteredInOther";
call = vcf2;
}
}
} else { // good call
if ( vcf2 != null ) {
if ( vcf2.isFiltered() )
set = priority1 + "-filteredInOther";
else
set = "Intersection";
}
}
call.addInfoField("set", set);
return call;
} else {
for ( String rodname : priority ) {
for ( VCFRecord rod : rods.keySet() ) {
if ( rods.get(rod).equals(rodname) )
return rod;
}
}
}
return null;
/**
 * Accumulates the per-locus count emitted by map() into the running total.
 */
public Integer reduce(Integer counter, Integer sum) {
    return counter + sum;
}
public VCFWriter reduce(VCFRecord record, VCFWriter writer) {
if ( record != null )
writer.addRecord(record);
return writer;
}
public void onTraversalDone(VCFWriter writer) {
if ( writer != null )
writer.close();
/**
 * Closes the VCF writer at the end of the traversal, flushing any buffered
 * output.  Null-checked in case initialize() never created a writer.
 */
public void onTraversalDone(Integer sum) {
    if ( vcfWriter != null )
        vcfWriter.close();
}
}

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
@ -87,6 +86,19 @@ public class SampleUtils {
/**
 * Collects the union of genotype sample names from every input rod that has
 * a VCF header (optionally restricted to the named rods), in sorted order.
 *
 * @param toolkit  the GATK engine holding the rod data sources
 * @param rodNames restrict to these rod names; null means all rods
 * @return sorted set of unique sample names across the selected VCF rods
 */
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
    Set<String> samples = new TreeSet<String>();
    for ( Map.Entry<String, VCFHeader> entry : getRodsWithVCFHeader(toolkit, rodNames).entrySet() )
        samples.addAll(entry.getValue().getGenotypeSamples());
    return samples;
}
/**
 * Convenience wrapper: returns just the names of the input rods that carry
 * a VCF header (optionally restricted to rodNames; null means all rods).
 */
public static Set<String> getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
    return getRodsWithVCFHeader(toolkit, rodNames).keySet();
}
public static Map<String, VCFHeader> getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
// iterate to get all of the sample names
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
@ -95,13 +107,23 @@ public class SampleUtils {
continue;
RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getRecordType().equals(VCFRecord.class) )
samples.addAll(rod.getHeader(VCFHeader.class).getGenotypeSamples());
if ( containsVCFHeader(rod) )
data.put(rod.getName(), rod.getHeader(VCFHeader.class));
}
return samples;
return data;
}
// todo -- remove when we can actually just get the header itself from tribble
/**
 * Determines whether a rod track carries a VCF header.  We probe by asking
 * for the header as a VCFHeader: a ClassCastException means the track's
 * header is of some other type, i.e. this is not a VCF rod.
 */
private static boolean containsVCFHeader(RMDTrack rod) {
    boolean hasVcfHeader;
    try {
        hasVcfHeader = rod.getHeader(VCFHeader.class) != null;
    } catch ( ClassCastException e ) {
        // header exists but is not a VCFHeader -> not a VCF track
        hasVcfHeader = false;
    }
    return hasVcfHeader;
}
/**
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
* (e.g. sampleX.1, sampleX.2, ...)
@ -118,14 +140,11 @@ public class SampleUtils {
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
// iterate to get all of the sample names
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getRecordType().equals(VCFRecord.class) ) {
Set<String> vcfSamples = rod.getHeader(VCFHeader.class).getGenotypeSamples();
for ( String sample : vcfSamples )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
}
for ( Map.Entry<String, VCFHeader> pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) {
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
for ( String sample : vcfSamples )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
}
}