Updates to fix some bugs in merger. Now able to merge into project wide indel VCF files. Integration teests coming tomorrow
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3727 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7be8c35eb2
commit
b934cc7554
|
|
@ -1,3 +1,27 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.broad.tribble.vcf;
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
@ -56,6 +80,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
||||||
this.lineType = lineType;
|
this.lineType = lineType;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
|
||||||
|
super(lineType.toString(), "", version);
|
||||||
|
this.name = name;
|
||||||
|
this.count = count;
|
||||||
|
this.type = type;
|
||||||
|
this.description = description;
|
||||||
|
this.lineType = lineType;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a VCF format header line
|
* create a VCF format header line
|
||||||
*
|
*
|
||||||
|
|
@ -111,6 +144,12 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
||||||
type == other.type;
|
type == other.type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
|
||||||
|
return count == other.count &&
|
||||||
|
type == other.type &&
|
||||||
|
name.equals(other.name);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
|
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
|
||||||
* @return true if we do, false otherwise
|
* @return true if we do, false otherwise
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broad.tribble.vcf;
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -35,7 +37,10 @@ public class VCFGenotypeRecord {
|
||||||
|
|
||||||
// what kind of phasing this genotype has
|
// what kind of phasing this genotype has
|
||||||
public enum PHASE {
|
public enum PHASE {
|
||||||
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
UNPHASED("/"), PHASED("|"), PHASED_SWITCH_PROB("\\"); // , UNKNOWN
|
||||||
|
|
||||||
|
String genotypeSeparator;
|
||||||
|
PHASE(String sep) { this.genotypeSeparator = sep; }
|
||||||
}
|
}
|
||||||
|
|
||||||
// our record
|
// our record
|
||||||
|
|
@ -98,14 +103,12 @@ public class VCFGenotypeRecord {
|
||||||
*/
|
*/
|
||||||
static PHASE determinePhase(String phase) {
|
static PHASE determinePhase(String phase) {
|
||||||
// find the phasing information
|
// find the phasing information
|
||||||
if (phase.equals("/"))
|
for ( PHASE p : PHASE.values() ) {
|
||||||
return PHASE.UNPHASED;
|
if (phase.equals(p.genotypeSeparator))
|
||||||
else if (phase.equals("|"))
|
return p;
|
||||||
return PHASE.PHASED;
|
}
|
||||||
else if (phase.equals("\\"))
|
|
||||||
return PHASE.PHASED_SWITCH_PROB;
|
throw new IllegalArgumentException("Unknown genotype phasing parameter: " + phase);
|
||||||
else
|
|
||||||
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -206,7 +209,7 @@ public class VCFGenotypeRecord {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getPloidy() {
|
public int getPloidy() {
|
||||||
return 2;
|
return mGenotypeAlleles.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public VCFRecord getRecord() {
|
public VCFRecord getRecord() {
|
||||||
|
|
@ -214,32 +217,15 @@ public class VCFGenotypeRecord {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
|
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
|
||||||
String str = "";
|
List<String> alleleStrings = new ArrayList<String>(altAlleles.size());
|
||||||
boolean first = true;
|
|
||||||
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
||||||
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||||
str += VCFGenotypeRecord.EMPTY_ALLELE;
|
alleleStrings.add(VCFGenotypeRecord.EMPTY_ALLELE);
|
||||||
else
|
else
|
||||||
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
alleleStrings.add(String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0));
|
||||||
if (first) {
|
|
||||||
switch (mPhaseType) {
|
|
||||||
case UNPHASED:
|
|
||||||
str += "/";
|
|
||||||
break;
|
|
||||||
case PHASED:
|
|
||||||
str += "|";
|
|
||||||
break;
|
|
||||||
case PHASED_SWITCH_PROB:
|
|
||||||
str += "\\";
|
|
||||||
break;
|
|
||||||
case UNKNOWN:
|
|
||||||
throw new UnsupportedOperationException("Unknown phase type");
|
|
||||||
}
|
|
||||||
first = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return str;
|
|
||||||
|
|
||||||
|
return Utils.join(mPhaseType.genotypeSeparator, alleleStrings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,10 @@ package org.broad.tribble.vcf;
|
||||||
*/
|
*/
|
||||||
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
||||||
|
|
||||||
|
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, VCFHeaderVersion version) {
|
||||||
|
super(name, count, type, description, SupportedHeaderLineType.INFO, version);
|
||||||
|
}
|
||||||
|
|
||||||
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
|
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
|
||||||
super(name, count, type, description, SupportedHeaderLineType.INFO);
|
super(name, count, type, description, SupportedHeaderLineType.INFO);
|
||||||
}
|
}
|
||||||
|
|
@ -23,4 +27,4 @@ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
||||||
boolean allowFlagValues() {
|
boolean allowFlagValues() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -136,7 +136,7 @@ public class VCFReaderUtils {
|
||||||
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
|
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
|
||||||
// parameters to create the VCF genotype record
|
// parameters to create the VCF genotype record
|
||||||
HashMap<String, String> tagToValue = new HashMap<String, String>();
|
HashMap<String, String> tagToValue = new HashMap<String, String>();
|
||||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNPHASED;
|
||||||
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
|
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
|
||||||
|
|
||||||
for (String key : keyStrings) {
|
for (String key : keyStrings) {
|
||||||
|
|
|
||||||
|
|
@ -183,10 +183,10 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
* @param cache
|
* @param cache
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
private List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||||
if ( GT.length() != 3 && GT.length() != 1 )
|
if ( GT.length() != 3 && GT.length() != 1 )
|
||||||
throw new StingException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
|
throw new VCFParserException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
|
||||||
|
|
||||||
List<Allele> GTAlleles = cache.get(GT);
|
List<Allele> GTAlleles = cache.get(GT);
|
||||||
|
|
||||||
|
|
@ -258,7 +258,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (String attr : attributes)
|
for (String attr : attributes)
|
||||||
if (Collections.binarySearch(fields,attr) < 0)
|
if (Collections.binarySearch(fields,attr) < 0)
|
||||||
throw new StingException("Unable to find field describing attribute " + attr);
|
throw new VCFParserException("Unable to find field describing attribute " + attr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -282,7 +282,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||||
// ref
|
// ref
|
||||||
if (!checkAllele(ref, true))
|
if (!checkAllele(ref, true))
|
||||||
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
|
throw new VCFParserException("Unable to parse out correct reference allele, we saw = " + ref);
|
||||||
Allele refAllele = Allele.create(ref, true);
|
Allele refAllele = Allele.create(ref, true);
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
|
|
||||||
|
|
@ -301,13 +301,13 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
* @param isRef are we the reference allele?
|
* @param isRef are we the reference allele?
|
||||||
* @return true if the allele is fine, false otherwise
|
* @return true if the allele is fine, false otherwise
|
||||||
*/
|
*/
|
||||||
private static boolean checkAllele(String allele,boolean isRef) {
|
private boolean checkAllele(String allele,boolean isRef) {
|
||||||
if (allele.contains("<")) {
|
if (allele.contains("<")) {
|
||||||
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
|
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
||||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
throw new VCFParserException("Unparsable vcf record with allele " + allele);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -320,7 +320,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
*/
|
*/
|
||||||
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
||||||
if (!checkAllele(alt,isRef))
|
if (!checkAllele(alt,isRef))
|
||||||
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
|
throw new VCFParserException("Unable to parse out correct alt allele, we saw = " + alt);
|
||||||
|
|
||||||
Allele allele = Allele.create(alt, false);
|
Allele allele = Allele.create(alt, false);
|
||||||
if ( ! allele.isNoCall() )
|
if ( ! allele.isNoCall() )
|
||||||
|
|
@ -403,10 +403,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
}
|
}
|
||||||
|
|
||||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||||
// } catch ( Exception e ) {
|
}
|
||||||
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
|
|
||||||
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
class VCFParserException extends StingException {
|
||||||
// }
|
public VCFParserException(String msg) {
|
||||||
|
super("Line " + lineNo + " generated parser exception " + msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFParserException(String msg, Throwable throwable) {
|
||||||
|
super("Line " + lineNo + " generated parser exception " + msg, throwable);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -439,7 +445,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
|
|
||||||
// check to see if the value list is longer than the key list, which is a problem
|
// check to see if the value list is longer than the key list, which is a problem
|
||||||
if (nGTKeys < GTValueSplitSize)
|
if (nGTKeys < GTValueSplitSize)
|
||||||
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
throw new VCFParserException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||||
|
|
||||||
int genotypeAlleleLocation = -1;
|
int genotypeAlleleLocation = -1;
|
||||||
if (nGTKeys > 1) {
|
if (nGTKeys > 1) {
|
||||||
|
|
@ -449,7 +455,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
gtAttributes.put(genotypeKeyArray[i],".");
|
gtAttributes.put(genotypeKeyArray[i],".");
|
||||||
else if (genotypeKeyArray[i].equals("GT"))
|
else if (genotypeKeyArray[i].equals("GT"))
|
||||||
if (i != 0)
|
if (i != 0)
|
||||||
throw new StingException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
||||||
else
|
else
|
||||||
genotypeAlleleLocation = i;
|
genotypeAlleleLocation = i;
|
||||||
else if (genotypeKeyArray[i].equals("GQ"))
|
else if (genotypeKeyArray[i].equals("GQ"))
|
||||||
|
|
@ -464,7 +470,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||||
}
|
}
|
||||||
// check to make sure we found a gentoype field
|
// check to make sure we found a gentoype field
|
||||||
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
|
if (genotypeAlleleLocation < 0) throw new VCFParserException("Unable to find required field GT for record " + locAndAlleles.first);
|
||||||
|
|
||||||
// assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes
|
// assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes
|
||||||
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
||||||
|
|
|
||||||
|
|
@ -84,6 +84,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
|
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
|
||||||
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
||||||
|
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
|
||||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -123,13 +124,18 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
String otherName = ((VCFFilterHeaderLine) other).getName();
|
String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||||
if ( ! lineName.equals(otherName) )
|
if ( ! lineName.equals(otherName) )
|
||||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||||
} else {
|
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||||
String lineName = ((VCFCompoundHeaderLine) line).getName();
|
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||||
String otherName = ((VCFCompoundHeaderLine) other).getName();
|
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||||
|
|
||||||
// if the names are the same, but the values are different, we need to quit
|
// if the names are the same, but the values are different, we need to quit
|
||||||
if (lineName.equals(otherName) && !line.equals(other))
|
if (! (compLine).equalsExcludingDescription(compOther) )
|
||||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||||
|
if ( ! compLine.getDescription().equals(compOther) )
|
||||||
|
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
|
||||||
|
} else {
|
||||||
|
// we are not equal, but we're not anything special either
|
||||||
|
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
map.put(key, line);
|
map.put(key, line);
|
||||||
|
|
@ -179,4 +185,4 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
if ( vcfWriter != null )
|
if ( vcfWriter != null )
|
||||||
vcfWriter.close();
|
vcfWriter.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -364,11 +364,7 @@ public class VCFWriter {
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
||||||
Object val;
|
Object val = g.hasAttribute(key) ? g.getAttribute(key) : MISSING_GENOTYPE_FIELD;
|
||||||
if (g.hasAttribute(key))
|
|
||||||
val = g.getAttribute(key);
|
|
||||||
else
|
|
||||||
val = new String(MISSING_GENOTYPE_FIELD);
|
|
||||||
|
|
||||||
// some exceptions
|
// some exceptions
|
||||||
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
|
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue