Updates to fix some bugs in merger. Now able to merge into project wide indel VCF files. Integration teests coming tomorrow
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3727 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7be8c35eb2
commit
b934cc7554
|
|
@ -1,3 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broad.tribble.vcf;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -56,6 +80,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
|||
this.lineType = lineType;
|
||||
}
|
||||
|
||||
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
|
||||
super(lineType.toString(), "", version);
|
||||
this.name = name;
|
||||
this.count = count;
|
||||
this.type = type;
|
||||
this.description = description;
|
||||
this.lineType = lineType;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
|
|
@ -111,6 +144,12 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
|||
type == other.type;
|
||||
}
|
||||
|
||||
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
|
||||
return count == other.count &&
|
||||
type == other.type &&
|
||||
name.equals(other.name);
|
||||
}
|
||||
|
||||
/**
|
||||
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
|
||||
* @return true if we do, false otherwise
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
|
|
@ -35,7 +37,10 @@ public class VCFGenotypeRecord {
|
|||
|
||||
// what kind of phasing this genotype has
|
||||
public enum PHASE {
|
||||
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||
UNPHASED("/"), PHASED("|"), PHASED_SWITCH_PROB("\\"); // , UNKNOWN
|
||||
|
||||
String genotypeSeparator;
|
||||
PHASE(String sep) { this.genotypeSeparator = sep; }
|
||||
}
|
||||
|
||||
// our record
|
||||
|
|
@ -98,14 +103,12 @@ public class VCFGenotypeRecord {
|
|||
*/
|
||||
static PHASE determinePhase(String phase) {
|
||||
// find the phasing information
|
||||
if (phase.equals("/"))
|
||||
return PHASE.UNPHASED;
|
||||
else if (phase.equals("|"))
|
||||
return PHASE.PHASED;
|
||||
else if (phase.equals("\\"))
|
||||
return PHASE.PHASED_SWITCH_PROB;
|
||||
else
|
||||
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
||||
for ( PHASE p : PHASE.values() ) {
|
||||
if (phase.equals(p.genotypeSeparator))
|
||||
return p;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Unknown genotype phasing parameter: " + phase);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -206,7 +209,7 @@ public class VCFGenotypeRecord {
|
|||
}
|
||||
|
||||
public int getPloidy() {
|
||||
return 2;
|
||||
return mGenotypeAlleles.size();
|
||||
}
|
||||
|
||||
public VCFRecord getRecord() {
|
||||
|
|
@ -214,32 +217,15 @@ public class VCFGenotypeRecord {
|
|||
}
|
||||
|
||||
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
|
||||
String str = "";
|
||||
boolean first = true;
|
||||
List<String> alleleStrings = new ArrayList<String>(altAlleles.size());
|
||||
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
||||
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||
str += VCFGenotypeRecord.EMPTY_ALLELE;
|
||||
alleleStrings.add(VCFGenotypeRecord.EMPTY_ALLELE);
|
||||
else
|
||||
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
||||
if (first) {
|
||||
switch (mPhaseType) {
|
||||
case UNPHASED:
|
||||
str += "/";
|
||||
break;
|
||||
case PHASED:
|
||||
str += "|";
|
||||
break;
|
||||
case PHASED_SWITCH_PROB:
|
||||
str += "\\";
|
||||
break;
|
||||
case UNKNOWN:
|
||||
throw new UnsupportedOperationException("Unknown phase type");
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
alleleStrings.add(String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0));
|
||||
}
|
||||
return str;
|
||||
|
||||
return Utils.join(mPhaseType.genotypeSeparator, alleleStrings);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -10,6 +10,10 @@ package org.broad.tribble.vcf;
|
|||
*/
|
||||
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
||||
|
||||
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, VCFHeaderVersion version) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.INFO, version);
|
||||
}
|
||||
|
||||
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.INFO);
|
||||
}
|
||||
|
|
@ -23,4 +27,4 @@ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
|||
boolean allowFlagValues() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -136,7 +136,7 @@ public class VCFReaderUtils {
|
|||
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
|
||||
// parameters to create the VCF genotype record
|
||||
HashMap<String, String> tagToValue = new HashMap<String, String>();
|
||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNPHASED;
|
||||
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
|
||||
|
||||
for (String key : keyStrings) {
|
||||
|
|
|
|||
|
|
@ -183,10 +183,10 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
* @param cache
|
||||
* @return
|
||||
*/
|
||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
private List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||
if ( GT.length() != 3 && GT.length() != 1 )
|
||||
throw new StingException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
|
||||
throw new VCFParserException("Unreasonable number of alleles: " + "GT=" + GT + " length=" + GT.length()); // 0/1 => barf on 10/0
|
||||
|
||||
List<Allele> GTAlleles = cache.get(GT);
|
||||
|
||||
|
|
@ -258,7 +258,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
int count = 0;
|
||||
for (String attr : attributes)
|
||||
if (Collections.binarySearch(fields,attr) < 0)
|
||||
throw new StingException("Unable to find field describing attribute " + attr);
|
||||
throw new VCFParserException("Unable to find field describing attribute " + attr);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -282,7 +282,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
// ref
|
||||
if (!checkAllele(ref, true))
|
||||
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
|
||||
throw new VCFParserException("Unable to parse out correct reference allele, we saw = " + ref);
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
|
|
@ -301,13 +301,13 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
* @param isRef are we the reference allele?
|
||||
* @return true if the allele is fine, false otherwise
|
||||
*/
|
||||
private static boolean checkAllele(String allele,boolean isRef) {
|
||||
private boolean checkAllele(String allele,boolean isRef) {
|
||||
if (allele.contains("<")) {
|
||||
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
|
||||
return false;
|
||||
}
|
||||
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
||||
throw new VCFParserException("Unparsable vcf record with allele " + allele);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
@ -320,7 +320,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
*/
|
||||
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
||||
if (!checkAllele(alt,isRef))
|
||||
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
|
||||
throw new VCFParserException("Unable to parse out correct alt allele, we saw = " + alt);
|
||||
|
||||
Allele allele = Allele.create(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
|
|
@ -403,10 +403,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
}
|
||||
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
// } catch ( Exception e ) {
|
||||
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
|
||||
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
||||
// }
|
||||
}
|
||||
|
||||
class VCFParserException extends StingException {
|
||||
public VCFParserException(String msg) {
|
||||
super("Line " + lineNo + " generated parser exception " + msg);
|
||||
}
|
||||
|
||||
public VCFParserException(String msg, Throwable throwable) {
|
||||
super("Line " + lineNo + " generated parser exception " + msg, throwable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -439,7 +445,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
|
||||
// check to see if the value list is longer than the key list, which is a problem
|
||||
if (nGTKeys < GTValueSplitSize)
|
||||
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||
throw new VCFParserException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||
|
||||
int genotypeAlleleLocation = -1;
|
||||
if (nGTKeys > 1) {
|
||||
|
|
@ -449,7 +455,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
gtAttributes.put(genotypeKeyArray[i],".");
|
||||
else if (genotypeKeyArray[i].equals("GT"))
|
||||
if (i != 0)
|
||||
throw new StingException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
||||
throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
||||
else
|
||||
genotypeAlleleLocation = i;
|
||||
else if (genotypeKeyArray[i].equals("GQ"))
|
||||
|
|
@ -464,7 +470,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
|||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||
}
|
||||
// check to make sure we found a gentoype field
|
||||
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
|
||||
if (genotypeAlleleLocation < 0) throw new VCFParserException("Unable to find required field GT for record " + locAndAlleles.first);
|
||||
|
||||
// assuming allele list length in the single digits, could be bad. Check for > 1 for haploid genotypes
|
||||
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
||||
|
|
|
|||
|
|
@ -84,6 +84,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
|
||||
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
||||
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
}
|
||||
|
||||
|
|
@ -123,13 +124,18 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else {
|
||||
String lineName = ((VCFCompoundHeaderLine) line).getName();
|
||||
String otherName = ((VCFCompoundHeaderLine) other).getName();
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||
|
||||
// if the names are the same, but the values are different, we need to quit
|
||||
if (lineName.equals(otherName) && !line.equals(other))
|
||||
if (! (compLine).equalsExcludingDescription(compOther) )
|
||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||
if ( ! compLine.getDescription().equals(compOther) )
|
||||
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
|
||||
} else {
|
||||
// we are not equal, but we're not anything special either
|
||||
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
||||
}
|
||||
} else {
|
||||
map.put(key, line);
|
||||
|
|
@ -179,4 +185,4 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
if ( vcfWriter != null )
|
||||
vcfWriter.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -364,11 +364,7 @@ public class VCFWriter {
|
|||
continue;
|
||||
|
||||
|
||||
Object val;
|
||||
if (g.hasAttribute(key))
|
||||
val = g.getAttribute(key);
|
||||
else
|
||||
val = new String(MISSING_GENOTYPE_FIELD);
|
||||
Object val = g.hasAttribute(key) ? g.getAttribute(key) : MISSING_GENOTYPE_FIELD;
|
||||
|
||||
// some exceptions
|
||||
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue