Updates to fix some bugs in merger. Now able to merge into project wide indel VCF files. Integration teests coming tomorrow

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3727 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-07 03:16:33 +00:00
parent 7be8c35eb2
commit b934cc7554
7 changed files with 94 additions and 57 deletions

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broad.tribble.vcf; package org.broad.tribble.vcf;
import java.util.Arrays; import java.util.Arrays;
@ -56,6 +80,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
this.lineType = lineType; this.lineType = lineType;
} }
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
super(lineType.toString(), "", version);
this.name = name;
this.count = count;
this.type = type;
this.description = description;
this.lineType = lineType;
}
/** /**
* create a VCF format header line * create a VCF format header line
* *
@ -111,6 +144,12 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
type == other.type; type == other.type;
} }
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
return count == other.count &&
type == other.type &&
name.equals(other.name);
}
/** /**
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
* @return true if we do, false otherwise * @return true if we do, false otherwise

View File

@ -1,5 +1,7 @@
package org.broad.tribble.vcf; package org.broad.tribble.vcf;
import org.broadinstitute.sting.utils.Utils;
import java.util.*; import java.util.*;
@ -35,7 +37,10 @@ public class VCFGenotypeRecord {
// what kind of phasing this genotype has // what kind of phasing this genotype has
public enum PHASE { public enum PHASE {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN UNPHASED("/"), PHASED("|"), PHASED_SWITCH_PROB("\\"); // , UNKNOWN
String genotypeSeparator;
PHASE(String sep) { this.genotypeSeparator = sep; }
} }
// our record // our record
@ -98,14 +103,12 @@ public class VCFGenotypeRecord {
*/ */
static PHASE determinePhase(String phase) { static PHASE determinePhase(String phase) {
// find the phasing information // find the phasing information
if (phase.equals("/")) for ( PHASE p : PHASE.values() ) {
return PHASE.UNPHASED; if (phase.equals(p.genotypeSeparator))
else if (phase.equals("|")) return p;
return PHASE.PHASED; }
else if (phase.equals("\\"))
return PHASE.PHASED_SWITCH_PROB; throw new IllegalArgumentException("Unknown genotype phasing parameter: " + phase);
else
throw new IllegalArgumentException("Unknown genotype phasing parameter");
} }
@ -206,7 +209,7 @@ public class VCFGenotypeRecord {
} }
public int getPloidy() { public int getPloidy() {
return 2; return mGenotypeAlleles.size();
} }
public VCFRecord getRecord() { public VCFRecord getRecord() {
@ -214,32 +217,15 @@ public class VCFGenotypeRecord {
} }
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) { private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = ""; List<String> alleleStrings = new ArrayList<String>(altAlleles.size());
boolean first = true;
for (VCFGenotypeEncoding allele : mGenotypeAlleles) { for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED) if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
str += VCFGenotypeRecord.EMPTY_ALLELE; alleleStrings.add(VCFGenotypeRecord.EMPTY_ALLELE);
else else
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0); alleleStrings.add(String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0));
if (first) {
switch (mPhaseType) {
case UNPHASED:
str += "/";
break;
case PHASED:
str += "|";
break;
case PHASED_SWITCH_PROB:
str += "\\";
break;
case UNKNOWN:
throw new UnsupportedOperationException("Unknown phase type");
}
first = false;
}
} }
return str;
return Utils.join(mPhaseType.genotypeSeparator, alleleStrings);
} }
@Override @Override

View File

@ -10,6 +10,10 @@ package org.broad.tribble.vcf;
*/ */
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, VCFHeaderVersion version) {
super(name, count, type, description, SupportedHeaderLineType.INFO, version);
}
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
super(name, count, type, description, SupportedHeaderLineType.INFO); super(name, count, type, description, SupportedHeaderLineType.INFO);
} }

View File

@ -136,7 +136,7 @@ public class VCFReaderUtils {
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) { public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
// parameters to create the VCF genotype record // parameters to create the VCF genotype record
HashMap<String, String> tagToValue = new HashMap<String, String>(); HashMap<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN; VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNPHASED;
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>(); List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
for (String key : keyStrings) { for (String key : keyStrings) {

View File

@ -183,10 +183,10 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
* @param cache * @param cache
* @return * @return
*/ */
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) { private List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
// this should cache results [since they are immutable] and return a single object for each genotype // this should cache results [since they are immutable] and return a single object for each genotype
if ( GT.length() != 3 && GT.length() != 1 ) if ( GT.length() != 3 && GT.length() != 1 )
throw new StingException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0 throw new VCFParserException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
List<Allele> GTAlleles = cache.get(GT); List<Allele> GTAlleles = cache.get(GT);
@ -258,7 +258,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
int count = 0; int count = 0;
for (String attr : attributes) for (String attr : attributes)
if (Collections.binarySearch(fields,attr) < 0) if (Collections.binarySearch(fields,attr) < 0)
throw new StingException("Unable to find field describing attribute " + attr); throw new VCFParserException("Unable to find field describing attribute " + attr);
} }
} }
@ -282,7 +282,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
// ref // ref
if (!checkAllele(ref, true)) if (!checkAllele(ref, true))
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref); throw new VCFParserException("Unable to parse out correct reference allele, we saw = " + ref);
Allele refAllele = Allele.create(ref, true); Allele refAllele = Allele.create(ref, true);
alleles.add(refAllele); alleles.add(refAllele);
@ -301,13 +301,13 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
* @param isRef are we the reference allele? * @param isRef are we the reference allele?
* @return true if the allele is fine, false otherwise * @return true if the allele is fine, false otherwise
*/ */
private static boolean checkAllele(String allele,boolean isRef) { private boolean checkAllele(String allele,boolean isRef) {
if (allele.contains("<")) { if (allele.contains("<")) {
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele); Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
return false; return false;
} }
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) { else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
throw new StingException("Unparsable vcf record with allele " + allele); throw new VCFParserException("Unparsable vcf record with allele " + allele);
} }
return true; return true;
} }
@ -320,7 +320,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
*/ */
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) { private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
if (!checkAllele(alt,isRef)) if (!checkAllele(alt,isRef))
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt); throw new VCFParserException("Unable to parse out correct alt allele, we saw = " + alt);
Allele allele = Allele.create(alt, false); Allele allele = Allele.create(alt, false);
if ( ! allele.isNoCall() ) if ( ! allele.isNoCall() )
@ -403,10 +403,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
} }
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
// } catch ( Exception e ) { }
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e); class VCFParserException extends StingException {
// } public VCFParserException(String msg) {
super("Line " + lineNo + " generated parser exception " + msg);
}
public VCFParserException(String msg, Throwable throwable) {
super("Line " + lineNo + " generated parser exception " + msg, throwable);
}
} }
/** /**
@ -439,7 +445,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
// check to see if the value list is longer than the key list, which is a problem // check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize) if (nGTKeys < GTValueSplitSize)
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]); throw new VCFParserException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1; int genotypeAlleleLocation = -1;
if (nGTKeys > 1) { if (nGTKeys > 1) {
@ -449,7 +455,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
gtAttributes.put(genotypeKeyArray[i],"."); gtAttributes.put(genotypeKeyArray[i],".");
else if (genotypeKeyArray[i].equals("GT")) else if (genotypeKeyArray[i].equals("GT"))
if (i != 0) if (i != 0)
throw new StingException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first); throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
else else
genotypeAlleleLocation = i; genotypeAlleleLocation = i;
else if (genotypeKeyArray[i].equals("GQ")) else if (genotypeKeyArray[i].equals("GQ"))
@ -464,7 +470,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
} }
// check to make sure we found a gentoype field // check to make sure we found a gentoype field
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first); if (genotypeAlleleLocation < 0) throw new VCFParserException("Unable to find required field GT for record " + locAndAlleles.first);
// assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes // assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';

View File

@ -84,6 +84,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values()); Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
headerLines.add(new VCFHeaderLine("source", "CombineVariants")); headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
} }
@ -123,13 +124,18 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
String otherName = ((VCFFilterHeaderLine) other).getName(); String otherName = ((VCFFilterHeaderLine) other).getName();
if ( ! lineName.equals(otherName) ) if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other ); throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else { } else if ( line instanceof VCFCompoundHeaderLine ) {
String lineName = ((VCFCompoundHeaderLine) line).getName(); VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
String otherName = ((VCFCompoundHeaderLine) other).getName(); VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
// if the names are the same, but the values are different, we need to quit // if the names are the same, but the values are different, we need to quit
if (lineName.equals(otherName) && !line.equals(other)) if (! (compLine).equalsExcludingDescription(compOther) )
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
if ( ! compLine.getDescription().equals(compOther) )
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
} else {
// we are not equal, but we're not anything special either
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
} }
} else { } else {
map.put(key, line); map.put(key, line);

View File

@ -364,11 +364,7 @@ public class VCFWriter {
continue; continue;
Object val; Object val = g.hasAttribute(key) ? g.getAttribute(key) : MISSING_GENOTYPE_FIELD;
if (g.hasAttribute(key))
val = g.getAttribute(key);
else
val = new String(MISSING_GENOTYPE_FIELD);
// some exceptions // some exceptions
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) { if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {