Preliminary commit of new VCFCombine, soon to be called CombineVariants (next commit), that supports merging any number of VCF files via a general VC merge routine that supports prioritization and merging of samples! It's now possible to merge the pilot1/2/3 call sets into a single (monster) VCF, taking genotypes from pilot2, then pilot3, then pilot1 as needed.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3690 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a46e22ed13
commit
b8d6a95e7a
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broad.tribble.vcf;
|
||||
|
||||
/**
 * A codec that carries a mutable, human-readable name.  Track builders use
 * this to stamp a codec with the name of the source track so that the
 * records it emits can be attributed back to their input file.
 */
public interface NameAwareCodec {
    /** @return the name currently assigned to this codec */
    public String getName();

    /** @param name the name to assign to this codec */
    public void setName(String name);
}
|
||||
|
|
@ -12,6 +12,11 @@ public class MutableGenotype extends Genotype {
|
|||
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
|
||||
}
|
||||
|
||||
    /**
     * Re-labeling copy constructor: takes every field from {@code parent}
     * (alleles, error, filters, attributes, phasing) but substitutes
     * {@code sampleName} as the sample name.  Used when merging tracks whose
     * sample names must be uniquified.
     */
    public MutableGenotype(String sampleName, Genotype parent) {
        super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
    }


    /**
     * Full constructor; forwards all fields unchanged to the Genotype
     * superclass.
     */
    public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
        super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
    }
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ import java.util.*;
|
|||
import org.apache.commons.jexl2.*;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
|
||||
|
|
@ -158,34 +160,59 @@ public class VariantContextUtils {
|
|||
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
|
||||
}
|
||||
|
||||
    /**
     * Options controlling how {@code simpleMerge} combines VariantContexts.
     * UNION_VARIANTS / INTERSECT_VARIANTS select which records survive the
     * merge; the *_GENOTYPES values select how genotypes for overlapping
     * samples are resolved (uniquified names, priority order, or input order).
     */
    public enum MergeType {
        UNION_VARIANTS, INTERSECT_VARIANTS, UNIQUIFY_GENOTYPES, PRIORITIZE_GENOTYPES, UNSORTED_GENOTYPES
    }
|
||||
|
||||
public static VariantContext simpleMerge(Set<VariantContext> VCs) {
|
||||
if ( VCs == null || VCs.size() == 0 )
|
||||
    /**
     * Convenience overload: merges the given VariantContexts with
     * INTERSECT_VARIANTS + UNSORTED_GENOTYPES semantics, no priority list,
     * and no origin annotation.
     *
     * @param unsortedVCs the contexts to merge, in any order
     * @return the merged context (null for a null/empty input, per the
     *         four-argument overload)
     */
    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
        return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
    }
|
||||
|
||||
|
||||
/**
|
||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
* the sample name
|
||||
*
|
||||
* @param unsortedVCs
|
||||
* @param priorityListOfVCs
|
||||
* @param mergeOptions
|
||||
* @return
|
||||
*/
|
||||
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
|
||||
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
Iterator<VariantContext> iter = VCs.iterator();
|
||||
if ( annotateOrigin && priorityListOfVCs == null )
|
||||
throw new IllegalArgumentException("Cannot merge calls and annotate their origins with a complete priority list of VariantContexts");
|
||||
|
||||
List<VariantContext> VCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, mergeOptions);
|
||||
|
||||
// establish the baseline info from the first VC
|
||||
VariantContext first = iter.next();
|
||||
VariantContext first = VCs.get(0);
|
||||
String name = first.getName();
|
||||
GenomeLoc loc = first.getLocation();
|
||||
Set<Allele> alleles = new HashSet<Allele>(first.getAlleles());
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(first.getGenotypes());
|
||||
double negLog10PError = first.isVariant() ? first.getNegLog10PError() : -1;
|
||||
Set<String> filters = new HashSet<String>(first.getFilters());
|
||||
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
double negLog10PError = -1;
|
||||
Set<String> filters = new HashSet<String>();
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
int depth = 0;
|
||||
if ( first.hasAttribute(VCFRecord.DEPTH_KEY) )
|
||||
depth = Integer.valueOf(first.getAttribute(VCFRecord.DEPTH_KEY).toString());
|
||||
|
||||
// filtering values
|
||||
int nFiltered = 0;
|
||||
|
||||
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
||||
while ( iter.hasNext() ) {
|
||||
VariantContext vc = iter.next();
|
||||
if ( !loc.equals(vc.getLocation()) || !first.getReference().equals(vc.getReference()) )
|
||||
return null;
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) )
|
||||
throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
||||
|
||||
nFiltered += vc.isFiltered() ? 1 : 0;
|
||||
|
||||
alleles.addAll(vc.getAlleles());
|
||||
genotypes.putAll(vc.getGenotypes());
|
||||
|
||||
mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
|
||||
|
||||
negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1);
|
||||
|
||||
|
|
@ -194,8 +221,74 @@ public class VariantContextUtils {
|
|||
depth += Integer.valueOf(vc.getAttribute(VCFRecord.DEPTH_KEY).toString());
|
||||
}
|
||||
|
||||
// if at least one record was unfiltered and we want a union, clear all of the filters
|
||||
if ( mergeOptions.contains(MergeType.UNION_VARIANTS) && nFiltered != VCs.size() )
|
||||
filters.clear();
|
||||
|
||||
// we care about where the call came from
|
||||
if ( annotateOrigin ) {
|
||||
String setValue = "";
|
||||
if ( nFiltered == 0 && VCs.size() == priorityListOfVCs.size() ) // nothing was unfiltered
|
||||
setValue = "Intersection";
|
||||
else if ( nFiltered == VCs.size() ) // everything was filtered out
|
||||
setValue = "FilteredInAll";
|
||||
else { // we are filtered in some subset
|
||||
List<String> s = new ArrayList<String>();
|
||||
for ( VariantContext vc : VCs )
|
||||
s.add( vc.isFiltered() ? "filterIn" + vc.getName() : vc.getName() );
|
||||
setValue = Utils.join("-", s);
|
||||
}
|
||||
|
||||
attributes.put("set", setValue);
|
||||
}
|
||||
|
||||
if ( depth > 0 )
|
||||
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
|
||||
return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
}
|
||||
|
||||
static class CompareByPriority implements Comparator<VariantContext> {
|
||||
List<String> priorityListOfVCs;
|
||||
public CompareByPriority(List<String> priorityListOfVCs) {
|
||||
this.priorityListOfVCs = priorityListOfVCs;
|
||||
}
|
||||
|
||||
private int getIndex(VariantContext vc) {
|
||||
int i = priorityListOfVCs.indexOf(vc.getName());
|
||||
if ( i == -1 ) throw new StingException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getName());
|
||||
return i;
|
||||
}
|
||||
|
||||
public int compare(VariantContext vc1, VariantContext vc2) {
|
||||
return new Integer(getIndex(vc1)).compareTo(getIndex(vc2));
|
||||
}
|
||||
}
|
||||
|
||||
public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions ) {
|
||||
if ( mergeOptions.contains(MergeType.PRIORITIZE_GENOTYPES) && priorityListOfVCs == null )
|
||||
throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");
|
||||
|
||||
if ( priorityListOfVCs == null || mergeOptions.contains(MergeType.UNSORTED_GENOTYPES) )
|
||||
return new ArrayList<VariantContext>(unsortedVCs);
|
||||
else {
|
||||
ArrayList<VariantContext> sorted = new ArrayList<VariantContext>(unsortedVCs);
|
||||
Collections.sort(sorted, new CompareByPriority(priorityListOfVCs));
|
||||
return sorted;
|
||||
}
|
||||
}
|
||||
|
||||
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) {
|
||||
for ( Genotype g : oneVC.getGenotypes().values() ) {
|
||||
String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples);
|
||||
if ( ! mergedGenotypes.containsKey(name) ) {
|
||||
// only add if the name is new
|
||||
Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g);
|
||||
mergedGenotypes.put(name, newG);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) {
|
||||
return uniqify ? sampleName + "." + trackName : sampleName;
|
||||
}
|
||||
}
|
||||
|
|
@ -199,6 +199,16 @@ public class RefMetaDataTracker {
|
|||
return getAllVariantContexts(ref, null, null, false, false);
|
||||
}
|
||||
|
||||
    /**
     * Returns all of the variant contexts that start at the current location
     * (delegates to the five-argument overload with requireStartHere = true
     * and no type filter).
     *
     * @param ref         the reference context at the current locus
     * @param curLocation the location that returned records must start at
     * @return the variant contexts starting at curLocation
     */
    public Collection<VariantContext> getAllVariantContexts(ReferenceContext ref, GenomeLoc curLocation) {
        return getAllVariantContexts(ref, null, curLocation, true, false);
    }
|
||||
|
||||
/**
|
||||
* Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only
|
||||
* VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
|||
* a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as
|
||||
* quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
|
||||
*/
|
||||
public class VCF4Codec implements FeatureCodec {
|
||||
public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||
|
||||
// a variant context flag for original allele strings
|
||||
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
|
||||
|
|
@ -62,7 +62,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
private final boolean validateFromHeader = false;
|
||||
|
||||
// we store a name to give to each of the variant contexts we emit
|
||||
private String name = "Unkown";
|
||||
private String name = "Unknown";
|
||||
|
||||
private int lineNo = 0;
|
||||
|
||||
|
|
@ -224,12 +224,12 @@ public class VCF4Codec implements FeatureCodec {
|
|||
String[] split = str.split(",");
|
||||
for (String substring : split) {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
objects.add(type.convert(substring));
|
||||
objects.add(type != null ? type.convert(substring) : substring);
|
||||
}
|
||||
value = objects;
|
||||
} else {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
value = type.convert(str);
|
||||
value = type != null ? type.convert(str) : str;
|
||||
}
|
||||
//System.out.printf("%s %s%n", key, value);
|
||||
} else {
|
||||
|
|
@ -366,7 +366,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
* @return a variant context object
|
||||
*/
|
||||
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
||||
try {
|
||||
// try {
|
||||
// increment the line count
|
||||
lineNo++;
|
||||
|
||||
|
|
@ -403,9 +403,10 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
} catch ( Exception e ) {
|
||||
throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
||||
}
|
||||
// } catch ( Exception e ) {
|
||||
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
|
||||
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
||||
// }
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -545,7 +546,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
|
||||
return this.header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get the name of this codec
|
||||
* @return our set name
|
||||
|
|
|
|||
|
|
@ -120,4 +120,8 @@ public abstract class RMDTrack {
|
|||
    /**
     * Returns this track's header as the requested type.  This base
     * implementation carries no header and always returns null; subclasses
     * that have a header are expected to override it.
     *
     * @param clazz the expected header class
     * @return the header, or null if this track has none
     * @throws ClassCastException if the stored header is not of type clazz
     */
    public <HeaderType> HeaderType getHeader(Class<HeaderType> clazz) throws ClassCastException {
        return null;
    }

    /** Untyped variant of {@link #getHeader(Class)}; base implementation returns null. */
    public Object getHeader() {
        return null;
    }
|
||||
}
|
||||
|
|
@ -32,6 +32,7 @@ import org.broad.tribble.index.Index;
|
|||
import org.broad.tribble.index.linear.LinearIndex;
|
||||
import org.broad.tribble.index.linear.LinearIndexCreator;
|
||||
import org.broad.tribble.readers.BasicFeatureReader;
|
||||
import org.broad.tribble.vcf.NameAwareCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
|
|
@ -95,9 +96,13 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
@Override
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
// return a feature reader track
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, inputFile);
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
|
||||
if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile);
|
||||
return new TribbleTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, pair.first, pair.second);
|
||||
return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second);
|
||||
}
|
||||
|
||||
    /**
     * Backwards-compatible overload: creates a feature reader using the
     * placeholder track name "anonymous".
     *
     * @param targetClass the codec class to instantiate for this track
     * @param inputFile   the file to read features from
     * @return the reader / sequence-dictionary pair
     */
    public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
        return createFeatureReader(targetClass, "anonymous", inputFile);
    }
|
||||
|
||||
/**
|
||||
|
|
@ -106,12 +111,12 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
* @param inputFile the input file to create the track from (of the codec type)
|
||||
* @return the FeatureReader instance
|
||||
*/
|
||||
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
|
||||
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = null;
|
||||
if (inputFile.getAbsolutePath().endsWith(".gz"))
|
||||
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile);
|
||||
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, name, inputFile);
|
||||
else
|
||||
pair = getLinearFeatureReader(targetClass, inputFile);
|
||||
pair = getLinearFeatureReader(targetClass, name, inputFile);
|
||||
return pair;
|
||||
}
|
||||
|
||||
|
|
@ -124,27 +129,34 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
* @param inputFile the file to load
|
||||
* @return a feature reader implementation
|
||||
*/
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) {
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, String name, File inputFile) {
|
||||
// we might not know the index type, try loading with the default reader constructor
|
||||
logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file");
|
||||
try {
|
||||
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)),null);
|
||||
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to create feature reader from file " + inputFile);
|
||||
}
|
||||
}
|
||||
|
||||
private FeatureCodec createCodec(Class targetClass, String name) {
|
||||
FeatureCodec codex = this.createByType(targetClass);
|
||||
if ( codex instanceof NameAwareCodec )
|
||||
((NameAwareCodec)codex).setName(name);
|
||||
return codex;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a linear feature reader, where we create the index ahead of time
|
||||
* @param targetClass the target class
|
||||
* @param inputFile the tribble file to parse
|
||||
* @return the input file as a FeatureReader
|
||||
*/
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, File inputFile) {
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> reader;
|
||||
try {
|
||||
Index index = loadIndex(inputFile, this.createByType(targetClass), true);
|
||||
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)),index.getSequenceDictionary());
|
||||
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
|
||||
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary());
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Unable to create reader with file " + inputFile, e);
|
||||
} catch (IOException e) {
|
||||
|
|
|
|||
|
|
@ -27,21 +27,18 @@ package org.broadinstitute.sting.gatk.walkers.vcf;
|
|||
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -51,191 +48,86 @@ import java.util.*;
|
|||
* priority list (if provided), emits a single record instance at every position represented in the rods.
|
||||
*/
|
||||
@Requires(value={})
|
||||
public class VCFCombine extends RodWalker<VCFRecord, VCFWriter> {
|
||||
public class VCFCombine extends RodWalker<Integer, Integer> {
|
||||
// the types of combinations we currently allow
|
||||
public enum COMBINATION_TYPE {
|
||||
UNION, MERGE
|
||||
}
|
||||
public enum ComboType { UNION, MERGE }
|
||||
@Argument(fullName="combination_type", shortName="type", doc="combination type; MERGE are supported", required=true)
|
||||
protected ComboType COMBO_TYPE;
|
||||
|
||||
@Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false)
|
||||
protected File OUTPUT_FILE = null;
|
||||
|
||||
@Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true)
|
||||
protected COMBINATION_TYPE COMBO_TYPE;
|
||||
|
||||
@Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; a complete priority list MUST be provided", required=false)
|
||||
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
|
||||
protected String PRIORITY_STRING = null;
|
||||
|
||||
@Argument(fullName="annotateUnion", shortName="A", doc="For the UNION combination type: if provided, the output union VCF will contain venn information about where each call came from", required=false)
|
||||
protected boolean annotateUnion = false;
|
||||
|
||||
private VCFWriter vcfWriter = null;
|
||||
private String[] priority = null;
|
||||
|
||||
// a map of rod name to uniquified sample name
|
||||
private HashMap<Pair<String, String>, String> rodNamesToSampleNames;
|
||||
private List<String> priority = null;
|
||||
protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
|
||||
|
||||
protected final static EnumSet<VariantContextUtils.MergeType> mergeTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES);
|
||||
protected final static EnumSet<VariantContextUtils.MergeType> unionTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.PRIORITIZE_GENOTYPES);
|
||||
|
||||
public void initialize() {
|
||||
|
||||
//Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
//hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
vcfWriter = new VCFWriter(out);
|
||||
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
|
||||
|
||||
validateAnnotateUnionArguments(priority);
|
||||
mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
|
||||
Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
|
||||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
metaData.add(new VCFHeaderLine("source", "VCF"));
|
||||
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
if ( OUTPUT_FILE != null )
|
||||
vcfWriter = new VCFWriter(OUTPUT_FILE);
|
||||
else
|
||||
vcfWriter = new VCFWriter(out);
|
||||
|
||||
if ( PRIORITY_STRING != null )
|
||||
priority = PRIORITY_STRING.split(",");
|
||||
|
||||
Set<String> samples;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
samples = new TreeSet<String>();
|
||||
rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
|
||||
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
if ( annotateUnion ) {
|
||||
validateAnnotateUnionArguments(priority);
|
||||
}
|
||||
|
||||
samples = SampleUtils.getUniqueSamplesFromRods(getToolkit());
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE);
|
||||
}
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||
metaData.add(new VCFHeaderLine("source", "VCFCombine"));
|
||||
vcfWriter.writeHeader(new VCFHeader(metaData, samples));
|
||||
}
|
||||
|
||||
private void validateAnnotateUnionArguments(String[] priority) {
|
||||
Set<RMDTrack> rods = VCFUtils.getRodVCFs(getToolkit());
|
||||
if ( priority == null || rods.size() != priority.length ) {
|
||||
throw new StingException("A complete priority list must be provided when annotateUnion is provided");
|
||||
}
|
||||
if ( priority.length != 2 ) {
|
||||
throw new StingException("When annotateUnion is provided only 2 VCF files can be merged");
|
||||
}
|
||||
|
||||
for ( String p : priority ) {
|
||||
boolean good = false;
|
||||
for ( RMDTrack data : rods ) {
|
||||
if ( p.equals(data.getName()) )
|
||||
good = true;
|
||||
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
|
||||
Set<String> samples = new HashSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOptions.contains(VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES)));
|
||||
}
|
||||
if ( ! good ) throw new StingException("Priority item not provided as a ROD! " + p);
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
private void validateAnnotateUnionArguments(List<String> priority) {
|
||||
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
|
||||
if ( priority == null || rodNames.size() != priority.size() )
|
||||
throw new StingException("A complete priority list must be provided when annotateUnion is provided");
|
||||
|
||||
if ( ! rodNames.containsAll(rodNames) )
|
||||
throw new StingException("Not all priority elements provided as input RODs: " + PRIORITY_STRING);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return null;
|
||||
return 0;
|
||||
|
||||
// get all of the vcf rods at this locus
|
||||
Map<VCFRecord, String> vcfRods = new LinkedHashMap<VCFRecord,String>();
|
||||
Iterator<GATKFeature> rods = tracker.getAllRods().iterator();
|
||||
while (rods.hasNext()) {
|
||||
GATKFeature feat = rods.next();
|
||||
Object rod = feat.getUnderlyingObject();
|
||||
if ( rod instanceof VCFRecord )
|
||||
vcfRods.put((VCFRecord)rod,feat.getName());
|
||||
}
|
||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
|
||||
if ( mergedVC != null ) // only operate at the start of events
|
||||
if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
|
||||
vcfWriter.add(mergedVC, ref.getBases());
|
||||
else
|
||||
logger.info(String.format("Ignoring complex event: " + mergedVC));
|
||||
|
||||
if ( vcfRods.size() == 0 )
|
||||
return null;
|
||||
|
||||
VCFRecord record = null;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
record = vcfUnion(vcfRods);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return record;
|
||||
return vcs.isEmpty() ? 0 : 1;
|
||||
}
|
||||
|
||||
public VCFWriter reduceInit() {
|
||||
return vcfWriter;
|
||||
    /** @return the initial value (zero) for the Integer reduce accumulator. */
    public Integer reduceInit() {
        return 0;
    }
|
||||
|
||||
private VCFRecord vcfUnion(Map<VCFRecord, String> rods) {
|
||||
if ( priority == null )
|
||||
return rods.keySet().iterator().next();
|
||||
|
||||
if ( annotateUnion ) {
|
||||
Map<String, VCFRecord> rodMap = new HashMap<String, VCFRecord>();
|
||||
for ( VCFRecord vcf : rods.keySet() ) {
|
||||
rodMap.put(rods.get(vcf),vcf);
|
||||
}
|
||||
|
||||
String priority1 = priority[0];
|
||||
String priority2 = priority[1];
|
||||
VCFRecord vcf1 = rodMap.containsKey(priority1) ? rodMap.get(priority1) : null;
|
||||
VCFRecord vcf2 = rodMap.containsKey(priority2) ? rodMap.get(priority2) : null;
|
||||
|
||||
// for simplicity, we are setting set and call for vcf1
|
||||
String set = priority1;
|
||||
VCFRecord call = vcf1;
|
||||
|
||||
if ( vcf1 == null ) {
|
||||
if ( vcf2 == null )
|
||||
//return null;
|
||||
throw new StingException("BUG: VCF1 and VCF2 are both null!");
|
||||
else {
|
||||
set = priority2;
|
||||
call = vcf2;
|
||||
}
|
||||
} else if ( vcf1.isFiltered() ) {
|
||||
if ( vcf2 != null ) {
|
||||
if ( vcf2.isFiltered() ) {
|
||||
set = "filteredInBoth";
|
||||
} else {
|
||||
set = priority2 + "-filteredInOther";
|
||||
call = vcf2;
|
||||
}
|
||||
}
|
||||
} else { // good call
|
||||
if ( vcf2 != null ) {
|
||||
if ( vcf2.isFiltered() )
|
||||
set = priority1 + "-filteredInOther";
|
||||
else
|
||||
set = "Intersection";
|
||||
}
|
||||
}
|
||||
|
||||
call.addInfoField("set", set);
|
||||
return call;
|
||||
} else {
|
||||
for ( String rodname : priority ) {
|
||||
for ( VCFRecord rod : rods.keySet() ) {
|
||||
if ( rods.get(rod).equals(rodname) )
|
||||
return rod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
public VCFWriter reduce(VCFRecord record, VCFWriter writer) {
|
||||
if ( record != null )
|
||||
writer.addRecord(record);
|
||||
return writer;
|
||||
}
|
||||
|
||||
public void onTraversalDone(VCFWriter writer) {
|
||||
if ( writer != null )
|
||||
writer.close();
|
||||
    /**
     * Traversal cleanup: closes the output VCF writer if one was opened.
     *
     * @param sum the final reduce total (unused here)
     */
    public void onTraversalDone(Integer sum) {
        if ( vcfWriter != null )
            vcfWriter.close();
    }
|
||||
}
|
||||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
|
|
@ -87,6 +86,19 @@ public class SampleUtils {
|
|||
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
|
||||
for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() )
|
||||
samples.addAll(header.getGenotypeSamples());
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
public static Set<String> getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
return getRodsWithVCFHeader(toolkit, rodNames).keySet();
|
||||
}
|
||||
|
||||
public static Map<String, VCFHeader> getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
|
|
@ -95,13 +107,23 @@ public class SampleUtils {
|
|||
continue;
|
||||
|
||||
RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getRecordType().equals(VCFRecord.class) )
|
||||
samples.addAll(rod.getHeader(VCFHeader.class).getGenotypeSamples());
|
||||
if ( containsVCFHeader(rod) )
|
||||
data.put(rod.getName(), rod.getHeader(VCFHeader.class));
|
||||
}
|
||||
|
||||
return samples;
|
||||
return data;
|
||||
}
|
||||
|
||||
    // todo -- remove when we can actually just get the header itself from tribble
    /**
     * Reports whether the rod can produce a VCFHeader.  Implemented by
     * probing the typed getHeader call: a null result or a
     * ClassCastException (header present but of another type) both mean
     * "no VCF header".
     */
    private static boolean containsVCFHeader(RMDTrack rod) {
        try {
            return rod.getHeader(VCFHeader.class) != null;
        } catch ( ClassCastException e ) {
            // header exists but is not a VCFHeader
            return false;
        }
    }
|
||||
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
|
|
@ -118,14 +140,11 @@ public class SampleUtils {
|
|||
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getRecordType().equals(VCFRecord.class) ) {
|
||||
Set<String> vcfSamples = rod.getHeader(VCFHeader.class).getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, VCFHeader> pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) {
|
||||
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue