merging now very close to working. Bug todo in writer and vcf infrastructure. Can almost create merged snp and indel files
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3712 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b6bdd61283
commit
cd2e4b0a1e
|
|
@ -14,7 +14,7 @@ import java.util.Map;
|
|||
* <p/>
|
||||
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
|
||||
*/
|
||||
public class VCFFormatHeaderLine extends VCFHeaderLine {
|
||||
public class VCFFormatHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
|
||||
// the format field types
|
||||
public enum FORMAT_TYPE {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -48,6 +50,9 @@ class VCF4Parser implements VCFLineParser {
|
|||
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||
if (start) start = false;
|
||||
else builder.append(",");
|
||||
|
||||
if ( entry.getValue() == null ) throw new StingException("Header problem: unbound value at " + entry + " from " + keyValues);
|
||||
|
||||
builder.append(entry.getKey());
|
||||
builder.append("=");
|
||||
builder.append(entry.getValue().toString().contains(",") ||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
|
@ -14,7 +12,7 @@ import java.util.Map;
|
|||
* <p/>
|
||||
* A class representing a key=value entry for INFO fields in the VCF header
|
||||
*/
|
||||
public class VCFInfoHeaderLine extends VCFHeaderLine {
|
||||
public class VCFInfoHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
|
||||
// the info field types
|
||||
public enum INFO_TYPE {
|
||||
|
|
@ -106,6 +104,7 @@ public class VCFInfoHeaderLine extends VCFHeaderLine {
|
|||
mType == other.mType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getmName() {
|
||||
return mName;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broad.tribble.vcf;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jul 2, 2010
|
||||
* Time: 2:40:45 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public interface VCFNamedHeaderLine {
|
||||
String getmName();
|
||||
}
|
||||
|
|
@ -175,6 +175,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
else if(inputFile.getName().toLowerCase().endsWith(".bam")) {
|
||||
unpackedReads.add( inputFile );
|
||||
}
|
||||
else if(inputFile.getName().equals("-")) {
|
||||
unpackedReads.add( new File("/dev/stdin") );
|
||||
}
|
||||
else {
|
||||
Utils.scareUser(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
|
||||
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +
|
||||
|
|
|
|||
|
|
@ -728,9 +728,9 @@ public class GenomeAnalysisEngine {
|
|||
ValidationExclusion exclusions) {
|
||||
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
|
||||
// sharding system; it's required with the new sharding system only for locus walkers.
|
||||
if(readsDataSource != null && !readsDataSource.hasIndex() && walker instanceof LocusWalker) {
|
||||
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
|
||||
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM) || intervals != null)
|
||||
throw new StingException("The GATK cannot currently process unindexed BAM files");
|
||||
throw new StingException("The GATK cannot currently process unindexed BAM files without the -U ALLOW_UNINDEXED_BAM, or with unindexed BAM files with the -L option");
|
||||
|
||||
Shard.ShardType shardType;
|
||||
if(walker instanceof LocusWalker) {
|
||||
|
|
|
|||
|
|
@ -165,7 +165,7 @@ public class VariantContextUtils {
|
|||
}
|
||||
|
||||
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
|
||||
return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
|
||||
return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false, false);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -179,7 +179,7 @@ public class VariantContextUtils {
|
|||
* @param mergeOptions
|
||||
* @return
|
||||
*/
|
||||
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
|
||||
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin, boolean printMessages ) {
|
||||
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
|
|
@ -256,7 +256,7 @@ public class VariantContextUtils {
|
|||
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
|
||||
|
||||
VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
//if ( remapped ) System.out.printf("Remapped => %s%n", merged);
|
||||
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
|
||||
return merged;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -434,11 +434,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
String sampleName = sampleNameIterator.next();
|
||||
|
||||
|
||||
// todo -- the parsing of attributes could be made lazy for performance
|
||||
Map<String, String> gtAttributes = null;
|
||||
String sampleName = sampleNameIterator.next();
|
||||
|
||||
// check to see if the value list is longer than the key list, which is a problem
|
||||
if (nGTKeys < GTValueSplitSize)
|
||||
|
|
@ -457,8 +454,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
genotypeAlleleLocation = i;
|
||||
else if (genotypeKeyArray[i].equals("GQ"))
|
||||
GTQual = parseQual(GTValueArray[i]);
|
||||
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
|
||||
genotypeFilters.addAll(parseFilters(GTValueArray[i]));
|
||||
else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
|
||||
genotypeFilters = parseFilters(GTValueArray[i]);
|
||||
else
|
||||
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);
|
||||
|
||||
|
|
|
|||
|
|
@ -25,8 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
|
|
@ -57,6 +56,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
|
||||
protected String PRIORITY_STRING = null;
|
||||
|
||||
@Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
|
||||
protected boolean printComplexMerges = false;
|
||||
|
||||
private VCFWriter vcfWriter = null;
|
||||
private List<String> priority = null;
|
||||
protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
|
||||
|
|
@ -72,16 +74,21 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
vcfWriter = new VCFWriter(out, true);
|
||||
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
|
||||
|
||||
|
||||
// todo -- need to merge headers in an intelligent way
|
||||
|
||||
validateAnnotateUnionArguments(priority);
|
||||
mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
|
||||
Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
|
||||
Map<String, VCFHeader> vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null);
|
||||
Set<String> samples = getSampleList(vcfRods, mergeOptions);
|
||||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
metaData.add(new VCFHeaderLine("source", "VCFCombine"));
|
||||
vcfWriter.writeHeader(new VCFHeader(metaData, samples));
|
||||
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
|
||||
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
}
|
||||
|
||||
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
|
||||
// todo -- Eric, where's a better place to put this?
|
||||
public static Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
|
|
@ -93,6 +100,48 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
return samples;
|
||||
}
|
||||
|
||||
// todo -- Eric, where's a better place to put this?
|
||||
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers) throws IllegalStateException {
|
||||
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
|
||||
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
|
||||
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getmName();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
if ( line.equals(other) )
|
||||
continue;
|
||||
// System.out.printf("equals duplicate key %s%n", line);
|
||||
else if ( ! line.getClass().equals(other.getClass()) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
else if ( line instanceof VCFFilterHeaderLine ) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getmName();
|
||||
String otherName = ((VCFFilterHeaderLine) other).getmName();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else {
|
||||
//String lineName = ((VCFInfoHeaderLine) line).getmName();
|
||||
//String otherName = ((VCFFilterHeaderLine) other).getmName();
|
||||
|
||||
// todo -- aaron, please complete these comparisons when INFO and Format header lines are made into one
|
||||
//if ( (lineType != null && ! lineType.equals(otherType)) || (lineCount != null && !lineCounts.equals(otherCount)))
|
||||
// throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
}
|
||||
} else {
|
||||
map.put(key, line);
|
||||
//System.out.printf("Adding header line %s%n", line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new HashSet<VCFHeaderLine>(map.values());
|
||||
}
|
||||
|
||||
|
||||
private void validateAnnotateUnionArguments(List<String> priority) {
|
||||
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
|
||||
if ( priority == null || rodNames.size() != priority.size() )
|
||||
|
|
@ -108,7 +157,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
// get all of the vcf rods at this locus
|
||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true, printComplexMerges);
|
||||
if ( mergedVC != null ) // only operate at the start of events
|
||||
//if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
|
||||
vcfWriter.add(mergedVC, ref.getBases());
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ public class VCFWriter {
|
|||
BufferedWriter mWriter;
|
||||
|
||||
private boolean writingVCF40Format;
|
||||
private String PASSES_FILTERS_STRING = null;
|
||||
|
||||
// our genotype sample fields
|
||||
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
|
|
@ -47,8 +48,8 @@ public class VCFWriter {
|
|||
|
||||
// default values
|
||||
private static final String UNFILTERED = ".";
|
||||
private static final String PASSES_FILTERS = "0";
|
||||
private static final String PASSES_FILTERS_VCF_4_0 = "PASS";
|
||||
private static final String PASSES_FILTERS_VCF3 = "0";
|
||||
private static final String PASSES_FILTERS_VCF4 = "PASS";
|
||||
private static final String EMPTY_INFO_FIELD = ".";
|
||||
private static final String EMPTY_ID_FIELD = ".";
|
||||
private static final String EMPTY_ALLELE_FIELD = ".";
|
||||
|
|
@ -66,12 +67,15 @@ public class VCFWriter {
|
|||
|
||||
public VCFWriter(File location, boolean useVCF4Format) {
|
||||
this.writingVCF40Format = useVCF4Format;
|
||||
this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
|
||||
|
||||
FileOutputStream output;
|
||||
try {
|
||||
output = new FileOutputStream(location);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new RuntimeException("Unable to create VCF file at location: " + location);
|
||||
}
|
||||
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(output));
|
||||
}
|
||||
|
||||
|
|
@ -87,6 +91,7 @@ public class VCFWriter {
|
|||
}
|
||||
public VCFWriter(OutputStream output, boolean useVCF4Format) {
|
||||
this.writingVCF40Format = useVCF4Format;
|
||||
this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(output));
|
||||
}
|
||||
|
||||
|
|
@ -238,8 +243,8 @@ public class VCFWriter {
|
|||
// TODO- clean up these flags and associated code
|
||||
boolean filtersWereAppliedToContext = true;
|
||||
List<String> allowedGenotypeAttributeKeys = null;
|
||||
boolean filtersWereAppliedToGenotypes = false;
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
|
||||
boolean filtersWereAppliedToGenotypes = true;
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_STRING : UNFILTERED);
|
||||
|
||||
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
|
||||
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
|
||||
|
|
@ -251,8 +256,6 @@ public class VCFWriter {
|
|||
|
||||
ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");
|
||||
|
||||
|
||||
|
||||
// search for reference allele and find trailing and padding at the end.
|
||||
if (originalAlleles != null) {
|
||||
Allele originalReferenceAllele = null;
|
||||
|
|
@ -339,7 +342,7 @@ public class VCFWriter {
|
|||
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
|
||||
vcfGenotypeAttributeKeys.add(key);
|
||||
}
|
||||
if ( filtersWereAppliedToGenotypes )
|
||||
if ( filtersWereAppliedToGenotypes ) // todo -- should be calculated
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
|
||||
}
|
||||
String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
|
||||
|
|
@ -382,7 +385,7 @@ public class VCFWriter {
|
|||
val = pileup.size();
|
||||
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
|
||||
// VCF 4.0 key for no filters is "."
|
||||
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : UNFILTERED;
|
||||
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS_STRING;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue