Updates to fix some bugs in merger. Now able to merge into project wide indel VCF files. Integration teests coming tomorrow

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3727 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-07 03:16:33 +00:00
parent 7be8c35eb2
commit b934cc7554
7 changed files with 94 additions and 57 deletions

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broad.tribble.vcf;
import java.util.Arrays;
@ -56,6 +80,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
this.lineType = lineType;
}
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
super(lineType.toString(), "", version);
this.name = name;
this.count = count;
this.type = type;
this.description = description;
this.lineType = lineType;
}
/**
* create a VCF format header line
*
@ -111,6 +144,12 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
type == other.type;
}
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
return count == other.count &&
type == other.type &&
name.equals(other.name);
}
/**
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
* @return true if we do, false otherwise

View File

@ -1,5 +1,7 @@
package org.broad.tribble.vcf;
import org.broadinstitute.sting.utils.Utils;
import java.util.*;
@ -35,7 +37,10 @@ public class VCFGenotypeRecord {
// what kind of phasing this genotype has
public enum PHASE {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
UNPHASED("/"), PHASED("|"), PHASED_SWITCH_PROB("\\"); // , UNKNOWN
String genotypeSeparator;
PHASE(String sep) { this.genotypeSeparator = sep; }
}
// our record
@ -98,14 +103,12 @@ public class VCFGenotypeRecord {
*/
static PHASE determinePhase(String phase) {
// find the phasing information
if (phase.equals("/"))
return PHASE.UNPHASED;
else if (phase.equals("|"))
return PHASE.PHASED;
else if (phase.equals("\\"))
return PHASE.PHASED_SWITCH_PROB;
else
throw new IllegalArgumentException("Unknown genotype phasing parameter");
for ( PHASE p : PHASE.values() ) {
if (phase.equals(p.genotypeSeparator))
return p;
}
throw new IllegalArgumentException("Unknown genotype phasing parameter: " + phase);
}
@ -206,7 +209,7 @@ public class VCFGenotypeRecord {
}
public int getPloidy() {
return 2;
return mGenotypeAlleles.size();
}
public VCFRecord getRecord() {
@ -214,32 +217,15 @@ public class VCFGenotypeRecord {
}
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = "";
boolean first = true;
List<String> alleleStrings = new ArrayList<String>(altAlleles.size());
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
str += VCFGenotypeRecord.EMPTY_ALLELE;
alleleStrings.add(VCFGenotypeRecord.EMPTY_ALLELE);
else
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {
switch (mPhaseType) {
case UNPHASED:
str += "/";
break;
case PHASED:
str += "|";
break;
case PHASED_SWITCH_PROB:
str += "\\";
break;
case UNKNOWN:
throw new UnsupportedOperationException("Unknown phase type");
}
first = false;
}
alleleStrings.add(String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0));
}
return str;
return Utils.join(mPhaseType.genotypeSeparator, alleleStrings);
}
@Override

View File

@ -10,6 +10,10 @@ package org.broad.tribble.vcf;
*/
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, VCFHeaderVersion version) {
super(name, count, type, description, SupportedHeaderLineType.INFO, version);
}
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
super(name, count, type, description, SupportedHeaderLineType.INFO);
}
@ -23,4 +27,4 @@ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
boolean allowFlagValues() {
return true;
}
}
}

View File

@ -136,7 +136,7 @@ public class VCFReaderUtils {
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
// parameters to create the VCF genotype record
HashMap<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNPHASED;
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
for (String key : keyStrings) {

View File

@ -183,10 +183,10 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
* @param cache
* @return
*/
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
private List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
// this should cache results [since they are immutable] and return a single object for each genotype
if ( GT.length() != 3 && GT.length() != 1 )
throw new StingException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
throw new VCFParserException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
List<Allele> GTAlleles = cache.get(GT);
@ -258,7 +258,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
int count = 0;
for (String attr : attributes)
if (Collections.binarySearch(fields,attr) < 0)
throw new StingException("Unable to find field describing attribute " + attr);
throw new VCFParserException("Unable to find field describing attribute " + attr);
}
}
@ -282,7 +282,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
// ref
if (!checkAllele(ref, true))
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
throw new VCFParserException("Unable to parse out correct reference allele, we saw = " + ref);
Allele refAllele = Allele.create(ref, true);
alleles.add(refAllele);
@ -301,13 +301,13 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
* @param isRef are we the reference allele?
* @return true if the allele is fine, false otherwise
*/
private static boolean checkAllele(String allele,boolean isRef) {
private boolean checkAllele(String allele,boolean isRef) {
if (allele.contains("<")) {
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
return false;
}
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
throw new StingException("Unparsable vcf record with allele " + allele);
throw new VCFParserException("Unparsable vcf record with allele " + allele);
}
return true;
}
@ -320,7 +320,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
*/
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
if (!checkAllele(alt,isRef))
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
throw new VCFParserException("Unable to parse out correct alt allele, we saw = " + alt);
Allele allele = Allele.create(alt, false);
if ( ! allele.isNoCall() )
@ -403,10 +403,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
}
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
// } catch ( Exception e ) {
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
// }
}
class VCFParserException extends StingException {
public VCFParserException(String msg) {
super("Line " + lineNo + " generated parser exception " + msg);
}
public VCFParserException(String msg, Throwable throwable) {
super("Line " + lineNo + " generated parser exception " + msg, throwable);
}
}
/**
@ -439,7 +445,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
throw new VCFParserException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys > 1) {
@ -449,7 +455,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
gtAttributes.put(genotypeKeyArray[i],".");
else if (genotypeKeyArray[i].equals("GT"))
if (i != 0)
throw new StingException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
else
genotypeAlleleLocation = i;
else if (genotypeKeyArray[i].equals("GQ"))
@ -464,7 +470,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
}
// check to make sure we found a gentoype field
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
if (genotypeAlleleLocation < 0) throw new VCFParserException("Unable to find required field GT for record " + locAndAlleles.first);
// assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';

View File

@ -84,6 +84,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
}
@ -123,13 +124,18 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
String otherName = ((VCFFilterHeaderLine) other).getName();
if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else {
String lineName = ((VCFCompoundHeaderLine) line).getName();
String otherName = ((VCFCompoundHeaderLine) other).getName();
} else if ( line instanceof VCFCompoundHeaderLine ) {
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
// if the names are the same, but the values are different, we need to quit
if (lineName.equals(otherName) && !line.equals(other))
if (! (compLine).equalsExcludingDescription(compOther) )
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
if ( ! compLine.getDescription().equals(compOther) )
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
} else {
// we are not equal, but we're not anything special either
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
}
} else {
map.put(key, line);
@ -179,4 +185,4 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
if ( vcfWriter != null )
vcfWriter.close();
}
}
}

View File

@ -364,11 +364,7 @@ public class VCFWriter {
continue;
Object val;
if (g.hasAttribute(key))
val = g.getAttribute(key);
else
val = new String(MISSING_GENOTYPE_FIELD);
Object val = g.hasAttribute(key) ? g.getAttribute(key) : MISSING_GENOTYPE_FIELD;
// some exceptions
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {