merging now very close to working. Bug todo in writer and vcf infrastructure. Can almost create merged snp and indel files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3712 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-02 20:09:25 +00:00
parent b6bdd61283
commit cd2e4b0a1e
10 changed files with 123 additions and 31 deletions

View File

@ -14,7 +14,7 @@ import java.util.Map;
* <p/>
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
*/
public class VCFFormatHeaderLine extends VCFHeaderLine {
public class VCFFormatHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
// the format field types
public enum FORMAT_TYPE {

View File

@ -1,5 +1,7 @@
package org.broad.tribble.vcf;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
/**
@ -48,6 +50,9 @@ class VCF4Parser implements VCFLineParser {
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
if (start) start = false;
else builder.append(",");
if ( entry.getValue() == null ) throw new StingException("Header problem: unbound value at " + entry + " from " + keyValues);
builder.append(entry.getKey());
builder.append("=");
builder.append(entry.getValue().toString().contains(",") ||

View File

@ -1,7 +1,5 @@
package org.broad.tribble.vcf;
import org.broad.tribble.util.ParsingUtils;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
@ -14,7 +12,7 @@ import java.util.Map;
* <p/>
* A class representing a key=value entry for INFO fields in the VCF header
*/
public class VCFInfoHeaderLine extends VCFHeaderLine {
public class VCFInfoHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
// the info field types
public enum INFO_TYPE {
@ -106,6 +104,7 @@ public class VCFInfoHeaderLine extends VCFHeaderLine {
mType == other.mType;
}
@Override
public String getmName() {
return mName;
}

View File

@ -0,0 +1,36 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broad.tribble.vcf;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jul 2, 2010
* Time: 2:40:45 PM
* To change this template use File | Settings | File Templates.
*/
public interface VCFNamedHeaderLine {
String getmName();
}

View File

@ -175,6 +175,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
else if(inputFile.getName().toLowerCase().endsWith(".bam")) {
unpackedReads.add( inputFile );
}
else if(inputFile.getName().equals("-")) {
unpackedReads.add( new File("/dev/stdin") );
}
else {
Utils.scareUser(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +

View File

@ -728,9 +728,9 @@ public class GenomeAnalysisEngine {
ValidationExclusion exclusions) {
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
// sharding system; it's required with the new sharding system only for locus walkers.
if(readsDataSource != null && !readsDataSource.hasIndex() && walker instanceof LocusWalker) {
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM) || intervals != null)
throw new StingException("The GATK cannot currently process unindexed BAM files");
throw new StingException("The GATK cannot currently process unindexed BAM files without the -U ALLOW_UNINDEXED_BAM, or with unindexed BAM files with the -L option");
Shard.ShardType shardType;
if(walker instanceof LocusWalker) {

View File

@ -165,7 +165,7 @@ public class VariantContextUtils {
}
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false, false);
}
@ -179,7 +179,7 @@ public class VariantContextUtils {
* @param mergeOptions
* @return
*/
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin, boolean printMessages ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null;
@ -256,7 +256,7 @@ public class VariantContextUtils {
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
//if ( remapped ) System.out.printf("Remapped => %s%n", merged);
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
return merged;
}

View File

@ -434,11 +434,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
String sampleName = sampleNameIterator.next();
// todo -- the parsing of attributes could be made lazy for performance
Map<String, String> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
@ -457,8 +454,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
genotypeAlleleLocation = i;
else if (genotypeKeyArray[i].equals("GQ"))
GTQual = parseQual(GTValueArray[i]);
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
genotypeFilters.addAll(parseFilters(GTValueArray[i]));
else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
genotypeFilters = parseFilters(GTValueArray[i]);
else
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);

View File

@ -25,8 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
@ -57,6 +56,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
protected String PRIORITY_STRING = null;
@Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
protected boolean printComplexMerges = false;
private VCFWriter vcfWriter = null;
private List<String> priority = null;
protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
@ -72,16 +74,21 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
vcfWriter = new VCFWriter(out, true);
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
// todo -- need to merge headers in an intelligent way
validateAnnotateUnionArguments(priority);
mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
Map<String, VCFHeader> vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null);
Set<String> samples = getSampleList(vcfRods, mergeOptions);
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine("source", "VCFCombine"));
vcfWriter.writeHeader(new VCFHeader(metaData, samples));
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
}
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
// todo -- Eric, where's a better place to put this?
public static Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
Set<String> samples = new TreeSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue();
@ -93,6 +100,48 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
return samples;
}
// todo -- Eric, where's a better place to put this?
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers) throws IllegalStateException {
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
for ( VCFHeader source : headers ) {
//System.out.printf("Merging in header %s%n", source);
for ( VCFHeaderLine line : source.getMetaData()) {
String key = line.getKey();
if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getmName();
if ( map.containsKey(key) ) {
VCFHeaderLine other = map.get(key);
if ( line.equals(other) )
continue;
// System.out.printf("equals duplicate key %s%n", line);
else if ( ! line.getClass().equals(other.getClass()) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
else if ( line instanceof VCFFilterHeaderLine ) {
String lineName = ((VCFFilterHeaderLine) line).getmName();
String otherName = ((VCFFilterHeaderLine) other).getmName();
if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else {
//String lineName = ((VCFInfoHeaderLine) line).getmName();
//String otherName = ((VCFFilterHeaderLine) other).getmName();
// todo -- aaron, please complete these comparisons when INFO and Format header lines are made into one
//if ( (lineType != null && ! lineType.equals(otherType)) || (lineCount != null && !lineCounts.equals(otherCount)))
// throw new IllegalStateException("Incompatible header types: " + line + " " + other );
}
} else {
map.put(key, line);
//System.out.printf("Adding header line %s%n", line);
}
}
}
return new HashSet<VCFHeaderLine>(map.values());
}
private void validateAnnotateUnionArguments(List<String> priority) {
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
if ( priority == null || rodNames.size() != priority.size() )
@ -108,7 +157,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
// get all of the vcf rods at this locus
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true, printComplexMerges);
if ( mergedVC != null ) // only operate at the start of events
//if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
vcfWriter.add(mergedVC, ref.getBases());

View File

@ -28,6 +28,7 @@ public class VCFWriter {
BufferedWriter mWriter;
private boolean writingVCF40Format;
private String PASSES_FILTERS_STRING = null;
// our genotype sample fields
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
@ -47,8 +48,8 @@ public class VCFWriter {
// default values
private static final String UNFILTERED = ".";
private static final String PASSES_FILTERS = "0";
private static final String PASSES_FILTERS_VCF_4_0 = "PASS";
private static final String PASSES_FILTERS_VCF3 = "0";
private static final String PASSES_FILTERS_VCF4 = "PASS";
private static final String EMPTY_INFO_FIELD = ".";
private static final String EMPTY_ID_FIELD = ".";
private static final String EMPTY_ALLELE_FIELD = ".";
@ -66,12 +67,15 @@ public class VCFWriter {
public VCFWriter(File location, boolean useVCF4Format) {
this.writingVCF40Format = useVCF4Format;
this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
FileOutputStream output;
try {
output = new FileOutputStream(location);
} catch (FileNotFoundException e) {
throw new RuntimeException("Unable to create VCF file at location: " + location);
}
mWriter = new BufferedWriter(new OutputStreamWriter(output));
}
@ -87,6 +91,7 @@ public class VCFWriter {
}
public VCFWriter(OutputStream output, boolean useVCF4Format) {
this.writingVCF40Format = useVCF4Format;
this.PASSES_FILTERS_STRING = useVCF4Format ? PASSES_FILTERS_VCF4 : PASSES_FILTERS_VCF3;
mWriter = new BufferedWriter(new OutputStreamWriter(output));
}
@ -238,8 +243,8 @@ public class VCFWriter {
// TODO- clean up these flags and associated code
boolean filtersWereAppliedToContext = true;
List<String> allowedGenotypeAttributeKeys = null;
boolean filtersWereAppliedToGenotypes = false;
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
boolean filtersWereAppliedToGenotypes = true;
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_STRING : UNFILTERED);
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
@ -251,8 +256,6 @@ public class VCFWriter {
ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");
// search for reference allele and find trailing and padding at the end.
if (originalAlleles != null) {
Allele originalReferenceAllele = null;
@ -339,7 +342,7 @@ public class VCFWriter {
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
vcfGenotypeAttributeKeys.add(key);
}
if ( filtersWereAppliedToGenotypes )
if ( filtersWereAppliedToGenotypes ) // todo -- should be calculated
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
}
String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
@ -382,7 +385,7 @@ public class VCFWriter {
val = pileup.size();
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
// VCF 4.0 key for no filters is "."
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : UNFILTERED;
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS_STRING;
}