First step in major cleanup/redo of VCF functionality. Specifically, now:
a) The VCF track name works again with 3.3 or 4.0 VCFs when specifying -B name,VCF,file. The code reads the header and parses the version automatically.
b) The old VCF codec is deprecated. The reader now goes directly from parsing VCF lines to producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to keep reading files the old way, a new VCF3Codec holds the old code, but it will eventually be deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and are output only in VCF 4.0 format.
d) As a consequence, the existing GATK bug where files were produced with a VCF4 body but VCF3.3 headers is fixed.
e) Several VCF 4.0 writer bugs are fixed.
f) Integration test MD5s have changed, mostly because of the corrected VCF 4.0 headers and because the validation data now mostly uses VCF 4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. The old versions are kept; the new versions have a .vcf4 extension.

Pending issues:
a) We still do not represent indels consistently or correctly. That will be the second part of these changes.
b) The VCF writer no longer uses VCFRecord, but it still relies on leftovers such as VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned up.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
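A toy illustration of the idea behind (a) — sniffing the ##fileformat meta line before deciding how to parse — in plain Java. The class and method names here are made up for illustration; in the commit itself this logic lives in VCF4Codec.readHeader.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

// Toy example: detect the VCF version from the ##fileformat meta line, the way the
// reworked codec decides between 3.x and 4.0 parsing. Names are illustrative only.
class VcfVersionSniffer {
    static String detectVersion(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            // meta lines all start with "##"; the fileformat line is normally the first one
            while ((line = reader.readLine()) != null && line.startsWith("##")) {
                if (line.startsWith("##fileformat=")) {
                    String value = line.substring("##fileformat=".length());
                    if (value.startsWith("VCFv4")) return "VCF4.0";
                    if (value.startsWith("VCFv3")) return "VCF3.x";
                    throw new IOException("Unrecognized fileformat line: " + line);
                }
            }
            throw new IOException("No ##fileformat line found in " + path);
        } finally {
            reader.close();
        }
    }
}

For a VCF 4.0 file the first meta line is "##fileformat=VCFv4.0", so detectVersion returns "VCF4.0"; the same dispatch is what lets a single -B name,VCF,file binding serve both versions.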
This commit is contained in:
parent
75bea4881a
commit
55b756f1cc
@@ -0,0 +1,124 @@
package org.broad.tribble.vcf;

import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Created by IntelliJ IDEA.
 * User: delangel
 * Date: Jul 13, 2010
 * Time: 3:57:01 PM
 * To change this template use File | Settings | File Templates.
 */
public class VCF3Codec implements FeatureCodec {

    // we have to store the list of strings that make up the header until they're needed
    private List<String> headerStrings = new ArrayList<String>();
    private VCFHeader header = null;
    private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;

    // some classes need to transform the line before
    private LineTransform transformer = null;

    /**
     * Fast path to get the location of the Feature for indexing
     * @param line the input line to decode
     * @return
     */
    public Feature decodeLoc(String line) {
        return reallyDecode(line, true);
    }

    /**
     * Decode a line as a Feature.
     *
     * @param line
     *
     * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
     * a comment)
     */
    public Feature decode(String line) {
        return reallyDecode(line, false);
    }

    private Feature reallyDecode(String line, boolean justLocationPlease ) {
        // transform the line, if we have a transform to do
        if (transformer != null) line = transformer.lineTransform(line);
        if (line.startsWith("#"))
            return null;

        // make a VCFRecord of the line and return it
        VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
        if ( ! justLocationPlease ) rec.setHeader(header);
        return rec;
    }

    /**
     * Return the # of header lines for this file. We use this to parse out the header
     *
     * @return 0
     */
    public int readHeader(LineReader reader) {
        String line = "";
        try {
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("##")) {
                    headerStrings.add(line);
                }
                else if (line.startsWith("#")) {
                    headerStrings.add(line);
                    header = VCFReaderUtils.createHeader(headerStrings,version);
                    return headerStrings.size();
                }
                else {
                    throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
                }

            }
        } catch (IOException e) {
            throw new RuntimeException("IO Exception ", e);
        }
        throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
    }

    /**
     * @return VCFRecord.class
     */
    public Class getFeatureType() {
        return VCFRecord.class;
    }

    public VCFHeader getHeader(Class clazz) throws ClassCastException {
        if (!clazz.equals(VCFHeader.class))
            throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
        return header;
    }

    public static interface LineTransform {
        public String lineTransform(String line);
    }

    public LineTransform getTransformer() {
        return transformer;
    }

    public void setTransformer(LineTransform transformer) {
        this.transformer = transformer;
    }

    public VCFHeaderVersion getVersion() {
        return version;
    }

    public void setVersion(VCFHeaderVersion version) {
        this.version = version;
    }
}
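As an aside, exercising this deprecated codec by hand follows the same readHeader-then-decode pattern the reworked VCFWriterUnitTest uses further down. A rough, hypothetical driver (not part of this commit) could look like the following; only calls that appear elsewhere in this diff are used.

import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.vcf.VCF3Codec;
import org.broad.tribble.vcf.VCFHeader;

import java.io.FileInputStream;
import java.io.IOException;

// Illustrative only: drives the deprecated VCF3Codec by hand, the same way the
// updated VCFWriterUnitTest drives VCFCodec (readHeader first, then decode line by line).
class VCF3CodecDriverSketch {
    public static void main(String[] args) throws IOException {
        AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(args[0]));

        VCF3Codec codec = new VCF3Codec();
        int headerLineCount = codec.readHeader(lineReader);   // consumes ## meta lines and the #CHROM line
        VCFHeader header = codec.getHeader(VCFHeader.class);
        System.out.printf("Read %d header lines, %d samples%n",
                headerLineCount, header.getGenotypeSamples().size());

        int records = 0;
        String line;
        while ((line = lineReader.readLine()) != null) {       // remaining lines are data records
            if (codec.decode(line) != null)
                records++;
        }
        System.out.printf("Decoded %d records%n", records);
    }
}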
@@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;

import java.io.IOException;
import java.util.ArrayList;

@@ -18,110 +19,4 @@ import java.util.List;
 *
 * The codec for VCF, which relies on VCFReaderUtils to do most of the processing
 */
public class VCFCodec implements FeatureCodec {

    // we have to store the list of strings that make up the header until they're needed
    private List<String> headerStrings = new ArrayList<String>();
    private VCFHeader header = null;
    private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;

    // some classes need to transform the line before
    private LineTransform transformer = null;

    /**
     * Fast path to get the location of the Feature for indexing
     * @param line the input line to decode
     * @return
     */
    public Feature decodeLoc(String line) {
        return reallyDecode(line, true);
    }

    /**
     * Decode a line as a Feature.
     *
     * @param line
     *
     * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
     * a comment)
     */
    public Feature decode(String line) {
        return reallyDecode(line, false);
    }

    private Feature reallyDecode(String line, boolean justLocationPlease ) {
        // transform the line, if we have a transform to do
        if (transformer != null) line = transformer.lineTransform(line);
        if (line.startsWith("#"))
            return null;

        // make a VCFRecord of the line and return it
        VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
        if ( ! justLocationPlease ) rec.setHeader(header);
        return rec;
    }

    /**
     * Return the # of header lines for this file. We use this to parse out the header
     *
     * @return 0
     */
    public int readHeader(LineReader reader) {
        String line = "";
        try {
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("##")) {
                    if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv3" ) )
                        throw new CodecLineParsingException("VCF codec can only parse VCF3 formated files. Your version line is " + line + ". If you want to parse VCF4, use VCF4 use VCF as the rod type");
                    headerStrings.add(line);
                }
                else if (line.startsWith("#")) {
                    headerStrings.add(line);
                    header = VCFReaderUtils.createHeader(headerStrings,version);
                    return headerStrings.size();
                }
                else {
                    throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
                }

            }
        } catch (IOException e) {
            throw new RuntimeException("IO Exception ", e);
        }
        throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
    }

    /**
     * @return VCFRecord.class
     */
    public Class getFeatureType() {
        return VCFRecord.class;
    }

    public VCFHeader getHeader(Class clazz) throws ClassCastException {
        if (!clazz.equals(VCFHeader.class))
            throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
        return header;
    }

    public static interface LineTransform {
        public String lineTransform(String line);
    }

    public LineTransform getTransformer() {
        return transformer;
    }

    public void setTransformer(LineTransform transformer) {
        this.transformer = transformer;
    }

    public VCFHeaderVersion getVersion() {
        return version;
    }

    public void setVersion(VCFHeaderVersion version) {
        this.version = version;
    }
}
public class VCFCodec extends VCF4Codec {}
@@ -82,7 +82,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
    }

    protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
        super(lineType.toString(), "", version);
        super(lineType.toString(), "");
        this.name = name;
        this.count = count;
        this.type = type;

@@ -98,7 +98,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
     *
     */
    protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "", version);
        super(lineType.toString(), "");
        Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
        name = mapping.get("ID");
        count = version == VCFHeaderVersion.VCF4_0 ?
@@ -34,22 +34,17 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader
     * @param version the vcf header version
     */
    protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
        super("FILTER", "", version);
        super("FILTER", "");
        Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
        name = mapping.get("ID");
        description = mapping.get("Description");
    }

    protected String toStringEncoding() {
        if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
            return String.format("FILTER=%s,\"%s\"", name, description);
        else if (mVersion == VCFHeaderVersion.VCF4_0) {
            Map<String,Object> map = new LinkedHashMap<String,Object>();
            map.put("ID", name);
            map.put("Description", description);
            return "FILTER=" + VCFHeaderLine.toStringEncoding(map);
        }
        else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
        Map<String,Object> map = new LinkedHashMap<String,Object>();
        map.put("ID", name);
        map.put("Description", description);
        return "FILTER=" + VCFHeaderLine.toStringEncoding(map);
    }

    public boolean equals(Object o) {
@@ -30,9 +30,6 @@ public class VCFHeader {
    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    // our header version
    private VCFHeaderVersion version;

    /** do we have genotying data? */
    private boolean hasGenotypingData = false;

@@ -70,7 +67,6 @@ public class VCFHeader {
        List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
        for ( VCFHeaderLine line : mMetaData )
            if ( VCFHeaderVersion.isFormatString(line.getKey())) {
                version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey());
                toRemove.add(line);
            }
        // remove old header lines for now,

@@ -98,10 +94,7 @@ public class VCFHeader {
     */
    public Set<VCFHeaderLine> getMetaData() {
        Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
        if (version == null)
            lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString()));
        else
            lines.add(new VCFHeaderLine(version.getFormatString(), version.getVersionString()));
        lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
        lines.addAll(mMetaData);
        return lines;
    }

@@ -129,19 +122,6 @@ public class VCFHeader {
        return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
     * convert the header to a new VCF version
     * @param version the version to convert to
     */
    public void setVersion(VCFHeaderVersion version) {
        if (version.equals(this.version))
            return; // we're all set, do nothing

        // store the new version, and update each of the header lines
        this.version = version;
        for (VCFHeaderLine line : mMetaData)
            line.setVersion(version);
    }
}
@@ -43,19 +43,6 @@ public class VCFHeaderLine implements Comparable {
    private String mKey = null;
    private String mValue = null;

    protected VCFHeaderVersion mVersion = null;

    /**
     * create a VCF header line
     *
     * @param key the key for this header line
     * @param value the value for this header line
     */
    public VCFHeaderLine(String key, String value, VCFHeaderVersion version) {
        mKey = key;
        mValue = value;
        mVersion = version;
    }

    /**
     * create a VCF header line

@@ -66,7 +53,6 @@ public class VCFHeaderLine implements Comparable {
    public VCFHeaderLine(String key, String value) {
        mKey = key;
        mValue = value;
        mVersion = VCFHeaderVersion.VCF3_3;
    }

    /**

@@ -127,18 +113,6 @@ public class VCFHeaderLine implements Comparable {
        return toString().compareTo(other.toString());
    }

    /**
     * set the version string, which resets the current stored string representation if the version changed
     * @param version
     */
    public void setVersion(VCFHeaderVersion version) {
        if (!version.equals(this.mVersion)) this.stringRep = null;
        this.mVersion = version;
    }

    public VCFHeaderVersion getVersion() {
        return mVersion;
    }

    /**
     * create a string of a mapping pair for the target VCF version
@@ -17,7 +17,7 @@ public class VCFReaderUtils {
     * package protected so that the VCFReaderUtils can access this function
     *
     * @param headerStrings a list of header strings
     *
     * @param version Header version to parse
     * @return a VCF Header created from the list of stinrgs
     */
    public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {

@@ -53,7 +53,7 @@ public class VCFReaderUtils {
            else {
                int equals = str.indexOf("=");
                if ( equals != -1 )
                    metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version));
                    metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
            }
        }
    }
@@ -25,12 +25,11 @@ import java.util.*;
 */
public class VCF4Codec implements FeatureCodec, NameAwareCodec {

    // a variant context flag for original allele strings
    public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";

    // we have to store the list of strings that make up the header until they're needed
    private VCFHeader header = null;

    private VCFHeaderVersion version = VCFHeaderVersion.VCF4_0;
    // used to convert the index of the alternate allele in genotypes to a integer index
    private static int ZERO_CHAR = (byte)'0';

@@ -66,6 +65,9 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
    private int lineNo = 0;

    // some classes need to transform the line before
    private LineTransform transformer = null;

    /**
     * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
     * @param reader the line reader to take header lines from

@@ -77,14 +79,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
        String line = "";
        try {
            boolean foundHeaderVersion = false;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                if (line.startsWith("##")) {
                    if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv4" ) )
                        throw new CodecLineParsingException("VCF4 codec can only parse VCF4 formated files. Your version line is " + line + ". If you want VCF3 parsing, use VCF as the rod type.");
                    String[] lineFields = line.substring(2).split("=");
                    if (lineFields.length == 2 &&
                        VCFHeaderVersion.isVersionString(lineFields[1]) && VCFHeaderVersion.isFormatString(lineFields[0])) {
                        foundHeaderVersion = true;
                        this.version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
                    }
                    headerStrings.add(line);
                }
                else if (line.startsWith("#")) {
                    if (!foundHeaderVersion) {
                        throw new CodecLineParsingException("We never saw a header line specifying VCF version");
                    }
                    return createHeader(headerStrings, line);
                }
                else {
@@ -107,7 +117,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
     */
    public int createHeader(List<String> headerStrings, String line) {
        headerStrings.add(line);
        header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
        header = VCFReaderUtils.createHeader(headerStrings, this.version);

        // load the parsing fields
        Set<VCFHeaderLine> headerLines = header.getMetaData();

@@ -226,12 +236,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
                String[] split = str.split(",");
                for (String substring : split) {
                    VCFHeaderLineType type = infoFields.get(key);
                    objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring);
                    // objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring);
                    objects.add(substring);
                }
                value = objects;
            } else {
                VCFHeaderLineType type = infoFields.get(key);
                value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str;
                //value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str;
                value = str;
            }
            //System.out.printf("%s %s%n", key, value);
        } else {

@@ -269,17 +281,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
     * @param qualString the quality string
     * @return return a double
     */
    private static Double parseQual(String qualString) {
        // todo -- remove double once we deal with annoying VCFs from 1KG
        if ( qualString.equals(".") )
            return VariantContext.NO_NEG_LOG_10PERROR;
        else {
            double q = Double.valueOf(qualString);
            if ( q == -1 )
                return VariantContext.NO_NEG_LOG_10PERROR;
            else
                return Double.valueOf(qualString) / 10;
        }
    private Double parseQual(String qualString) {
        if (qualString.equals(VCFConstants.MISSING_VALUE_v4))
            return VariantContext.NO_NEG_LOG_10PERROR;
        else {
            double q = Double.valueOf(qualString);
            if ( q == -1 )
                return VariantContext.NO_NEG_LOG_10PERROR;
            else
                return Double.valueOf(qualString) / 10;
        }
    }

    /**

@@ -346,7 +357,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
        Set<String> fFields;

        // a PASS is simple (no filters)
        if ( filterString.equals("PASS") ) {
        String passString = VCFConstants.PASSES_FILTERS_v3;
        if (this.version == VCFHeaderVersion.VCF4_0)
            passString = VCFConstants.PASSES_FILTERS_v4;

        if ( filterString.equals(passString) ) {
            return null;
        }
        if ( filterString.equals(VCFConstants.UNFILTERED)) {
            return null;
        }
        // else do we have the filter string cached?
@@ -373,6 +391,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
     * parse out the VCF line
     *
     * @param parts the parts split up
     * @param parseGenotypes whether to parse genotypes or not
     * @return a variant context object
     */
    private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {

@@ -398,7 +417,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
        // find out our current location, and clip the alleles down to their minimum length
        Pair<GenomeLoc, List<Allele>> locAndAlleles;
        if ( !isSingleNucleotideEvent(alleles) ) {
            attributes.put(ORIGINAL_ALLELE_LIST,alleles);
            if (this.version != VCFHeaderVersion.VCF4_0)
                throw new VCFParserException("Saw Indel/non SNP event on a VCF 3.3 or earlier file. Please convert file to VCF4.0 with VCFTools before using the GATK on it");
            locAndAlleles = clipAlleles(contig, pos, ref, alleles);
        } else {
            locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);

@@ -469,8 +489,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
        if (nGTKeys >= 1) {
            gtAttributes = new HashMap<String, String>(nGTKeys - 1);
            for (int i = 0; i < nGTKeys; i++) {
                if (i >= GTValueSplitSize)
                    gtAttributes.put(genotypeKeyArray[i],".");
                if (i >= GTValueSplitSize) {
                    if (genotypeKeyArray[i].equals("GQ"))
                        GTQual = parseQual(VCFConstants.MISSING_VALUE_v4);
                    else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
                        genotypeFilters = parseFilters(VCFConstants.MISSING_VALUE_v4);
                    else
                        gtAttributes.put(genotypeKeyArray[i],VCFConstants.MISSING_VALUE_v4);
                }
                else if (genotypeKeyArray[i].equals("GT"))
                    if (i != 0)
                        throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);

@@ -480,9 +506,11 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
                    GTQual = parseQual(GTValueArray[i]);
                else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
                    genotypeFilters = parseFilters(GTValueArray[i]);
                else
                else {
                    if (this.version != VCFHeaderVersion.VCF4_0 && GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3))
                        GTValueArray[i] = VCFConstants.MISSING_VALUE_v4;
                    gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);

                }
            }
            // validate the format fields
            validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
@@ -517,7 +545,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
    static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
        List<Allele> newAlleleList = new ArrayList<Allele>();

        // find the preceeding string common to all alleles and the reference
        /* //+
        boolean clipping = true;
        int forwardClipping = 0;
        while(clipping) {
            for (Allele a : unclippedAlleles) {
                if (a.length()-forwardClipping == 0)
                    clipping = false;
                else if (a.getBases()[forwardClipping] != ref.getBytes()[forwardClipping])
                    clipping = false;
                if (clipping) forwardClipping++;
            }

        }
        //-
        */ // find the preceeding string common to all alleles and the reference
        boolean clipping = true;
        for (Allele a : unclippedAlleles)
            if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {

@@ -576,6 +619,19 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
        return name;
    }

    public static interface LineTransform {
        public String lineTransform(String line);
    }

    public LineTransform getTransformer() {
        return transformer;
    }

    public void setTransformer(LineTransform transformer) {
        this.transformer = transformer;
    }

    /**
     * set the name of this codec
     * @param name

@@ -583,4 +639,6 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
    public void setName(String name) {
        this.name = name;
    }

}
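The clipAlleles change above keeps a commented-out forward-clipping loop; the underlying idea is simply to find the leading bases shared by the reference and every alternate allele. A small, self-contained sketch of that idea on plain strings (hypothetical helper, not the GATK implementation, which works on Allele objects and GenomeLocs):

import java.util.Arrays;
import java.util.List;

// Illustrative only: counts the leading bases common to the reference and all
// alternate alleles, which is what forward clipping of a VCF4 indel record removes.
class AlleleClipSketch {
    static int commonLeadingBases(String ref, List<String> alts) {
        int clip = 0;
        outer:
        while (clip < ref.length()) {
            char base = ref.charAt(clip);
            for (String alt : alts) {
                if (clip >= alt.length() || alt.charAt(clip) != base)
                    break outer;   // an allele ran out of bases or disagrees with the reference
            }
            clip++;
        }
        return clip;
    }

    public static void main(String[] args) {
        // For the GGG -> G deletion used in VCF4UnitTest below, one leading base is shared.
        System.out.println(commonLeadingBases("GGG", Arrays.asList("G")));   // prints 1
    }
}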
@@ -62,7 +62,7 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {

    protected static String line = null;
    final TreeSet<String> samples = new TreeSet<String>();
    VCF4Codec vcf4codec = new VCF4Codec();
    VCFCodec vcf4codec = new VCFCodec();

    /**

@@ -86,32 +86,25 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
                if (rod.getType().equals(vcf4codec.getClass())) {

                    try {
                        AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
                        int lineNumber = vcf4codec.readHeader(lineReader);
                        out.printf("Read %d header lines%n", lineNumber);
                try {
                    AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
                    int lineNumber = vcf4codec.readHeader(lineReader);
                    out.printf("Read %d header lines%n", lineNumber);

                    header = vcf4codec.getHeader(VCFHeader.class);
                }
                catch (FileNotFoundException e ) {
                    throw new StingException(e.getMessage());
                }
                } else {
                    final VCFReader reader = new VCFReader(rod.getFile());
                    final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
                    samples.addAll(vcfSamples);
                    reader.close();
                    header = new VCFHeader(hInfo, samples);
                    header = vcf4codec.getHeader(VCFHeader.class);
                }
                catch (FileNotFoundException e ) {
                    throw new StingException(e.getMessage());
                }

                final Set<String> vcfSamples = header.getGenotypeSamples();
                samples.addAll(vcfSamples);

            }
        }

        if ( header != null )
            header.setVersion(VCFHeaderVersion.VCF4_0);

        vcfWriter.writeHeader(header);

    }
@@ -86,10 +86,10 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
        if ( vc_eval == null || vc_eval.isFiltered() )
            return 0;

        if (vc_eval.getType() != VariantContext.Type.SNP)
        if (!vc_eval.isSNP())
            return 0;

        if (vc_eval.getAlleles().size()!= 2)
        if (!vc_eval.isBiallelic())
            return 0;

        // output marker ID to Beagle input file
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.genotype.vcf;

import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.GenomeLocParser;

@@ -61,7 +62,7 @@ public class VCFUtils {
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            RMDTrack rod = source.getReferenceOrderedData();
            if ( rod.getRecordType().equals(VCFRecord.class) ) {
            if ( rod.getRecordType().equals(VariantContext.class) ) {
                fields.addAll(rod.getHeader(VCFHeader.class).getMetaData());
            }
        }

@@ -139,7 +140,7 @@ public class VCFUtils {
        if ( map.containsKey(key) ) {
            VCFHeaderLine other = map.get(key);
            if ( line.equals(other) || line.getVersion() != VCFHeaderVersion.VCF4_0 ) // todo -- remove me when everything is 4
            if ( line.equals(other) )
                continue;
            else if ( ! line.getClass().equals(other.getClass()) )
                throw new IllegalStateException("Incompatible header types: " + line + " " + other );

@@ -173,7 +174,6 @@ public class VCFUtils {
            if ( logger != null ) logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
        }
        } else {
            line.setVersion(VCFHeaderVersion.VCF4_0); // todo -- remove this when we finally have vcf3/4 unified headers
            map.put(key, line);
            //System.out.printf("Adding header line %s%n", line);
        }
@@ -201,9 +201,6 @@ public class VCFWriter {
        String paddingBases = "";
        String trailingBases = "";

        ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");

        // search for reference allele and find trailing and padding at the end.
        // See first if all alleles have a base encoding (ie no deletions)
        // if so, add one common base to all alleles (reference at this location)

@@ -217,50 +214,12 @@ public class VCFWriter {
                refString = new String(a.getBases());
        }

        if (originalAlleles != null) {
            // if original allele info was filled when reading from a VCF4 file,
            // determine whether there was a padding base(s) at the beginning and end.
            byte previousBase = 0;
            int cnt=0;
            boolean firstBaseCommonInAllAlleles = true;
            Allele originalReferenceAllele = null;
            for (Allele originalAllele : originalAlleles){
                // if first base of allele is common to all of them, there may have been a common base deleted from all
                byte firstBase = originalAllele.getBases()[0];
                if (cnt > 0) {
                    if (firstBase != previousBase)
                        firstBaseCommonInAllAlleles = false;
                }
                previousBase = firstBase;
                cnt++;

                if (originalAllele.isReference())
                    originalReferenceAllele = originalAllele;
            }

            numTrailingBases = (firstBaseCommonInAllAlleles)? 1:0;
            position -= numTrailingBases;

            if (originalReferenceAllele == null)
                throw new IllegalStateException("At least one Allele must be reference");

            String originalRef = new String(originalReferenceAllele.getBases());
            numPaddingBases = originalRef.length()-refString.length()-numTrailingBases;

            if (numTrailingBases > 0) {
                trailingBases = originalRef.substring(0,numTrailingBases);
            }
            if (numPaddingBases > 0)
                paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length());
        }
        else {
            // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
            if (!hasBasesInAllAlleles) {
                trailingBases = new String(refBases);
                numTrailingBases = 1;
                position--;
            }
        // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
        if (!hasBasesInAllAlleles) {
            trailingBases = new String(refBases);
            numTrailingBases = 1;
            position--;
        }

@@ -295,6 +254,7 @@ public class VCFWriter {
            // this needs to be done in case all samples are no-calls
            vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
        }

        String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);

        List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());

@@ -378,10 +338,6 @@ public class VCFWriter {
            if ( key.equals("ID") )
                continue;

            // Original alleles are not for reporting but only for internal bookkeeping
            if (key.equals("ORIGINAL_ALLELE_LIST"))
                continue;

            String outputValue = formatVCFField(key, elt.getValue());
            if ( outputValue != null )
                infoFields.put(key, outputValue);

@@ -587,14 +543,21 @@ public class VCFWriter {
        Set<String> keys = new HashSet<String>();

        boolean sawGoodQual = false;
        boolean sawGenotypeFilter = false;
        for ( Genotype g : vc.getGenotypes().values() ) {
            keys.addAll(g.getAttributes().keySet());
            if ( g.hasNegLog10PError() )
                sawGoodQual = true;
            if (g.isFiltered() && g.isCalled())
                sawGenotypeFilter = true;
        }

        if ( sawGoodQual )
            keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);

        if (sawGenotypeFilter)
            keys.add(VCFConstants.GENOTYPE_FILTER_KEY);

        return Utils.sorted(new ArrayList<String>(keys));
    }
@@ -205,20 +205,20 @@ public class VCF4UnitTest extends BaseTest {
    String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
    @Test
    public void testClippedAllelesAddedAsAnnotation() {
        TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
        // TODO - either modify or kill this test
        /* TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
        VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine);
        Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
        List<Allele> alleles = (List<Allele>)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST);
        Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size());
        Assert.assertTrue(alleles.get(0).basesMatch("GGG"));
        Assert.assertTrue(alleles.get(1).basesMatch("G"));
    }
    */ }

    // test that when we don't clip the alleles, we don't see the annotation
    @Test
    public void testNoClippedAllelesNoAddedAsAnnotation() {
        TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
        VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine);
        Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
    }

    // test that we're getting the right genotype for a multi-base polymorphism
@@ -70,11 +70,11 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
    @Test
    public void testGenotypesToVCFUsingVCFInput() {
        List<String> md5 = new ArrayList<String>();
        md5.add("b423141ca600d581dc73e9b3dff4f782");
        md5.add("919eb499bfcc980a14825a0265e575e3");

        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " -B variant,VCF," + validationDataLocation + "complexExample.vcf" +
                " -B variant,VCF," + validationDataLocation + "complexExample.vcf4" +
                " -T VariantsToVCF" +
                " -o %s",
                1, // just one output file
@@ -16,9 +16,9 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
        executeTest("testFastaReference", spec1);

        WalkerTestSpec spec2 = new WalkerTestSpec(
                "-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s",
                "-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s",
                1,
                Arrays.asList("3a48986c3832a768b478c3e95f994b0f"));
                Arrays.asList("a7bfc673eaa202a668c1424a933671ad"));
        executeTest("testFastaAlternateReferenceIndels", spec2);

        WalkerTestSpec spec4 = new WalkerTestSpec(
@@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
    public void testClusteredSnps() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -window 10 -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
                Arrays.asList("fa522e5b8c89c5a465ca03dd88bc2b8f"));
                Arrays.asList("2385975931cd06fca452655bebf5c379"));
        executeTest("test clustered SNPs", spec);
    }

@@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
    public void testMask() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -mask foo -B mask,VCF," + validationDataLocation + "vcfexample2.vcf -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
                Arrays.asList("4d8a184417289c325410a5e545d9fc80"));
                Arrays.asList("6743efa09985206819adcf8eaf5ff936"));
        executeTest("test mask", spec);
    }

@@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
    public void testFilter1() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
                Arrays.asList("d48062f139d853a61e4f0bfb76bd695d"));
                Arrays.asList("e054f57d3794ce8a57cae92f16886cf0"));
        executeTest("test filter #1", spec);
    }

@@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
    public void testFilter2() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
                Arrays.asList("791f89c6da7bdde79913754d9ddb72eb"));
                Arrays.asList("db09a1f7ff523087ea0f6a56f3febfe7"));
        executeTest("test filter #2", spec);
    }

@@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
    public void testFilterWithSeparateNames() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " --filterName ABF -filter 'AlleleBalance < 70.0' --filterName FSF -filter 'FisherStrand == 1.4' -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
                Arrays.asList("174caf880477f3edf68fe6722f227017"));
                Arrays.asList("edbd505d8d55b4ba71f99d7006871db0"));
        executeTest("test filter with separate names #2", spec);
    }
}
@@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {

        WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
                "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
                Arrays.asList("9ebe61dcb5112e7e745412d7767d101a"));
                Arrays.asList("e3f402fbbb6bbb4f60b1aa0549989d85"));
        executeTest("testConfidence2", spec2);
    }
@@ -29,9 +29,9 @@ public class IndelRealignerIntegrationTest extends WalkerTest {

    @Test
    public void testRealignerKnownsOnly() {
        String[] md5s = {"7084d4e543bc756730ab306768028530", "1091436c40d5ba557d85662999cc0c1d"};
        String[] md5s = {"83bc0c9a7d8872b552c6cbd994672c3b", "92bf331b672a03d63c26e4d9d3615f5b"};
        WalkerTestSpec spec = new WalkerTestSpec(
                "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
                "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
                2,
                Arrays.asList(md5s));
        executeTest("test realigner known indels only", spec);
@@ -8,10 +8,10 @@ import java.util.Arrays;
public class PickSequenomProbesIntegrationTest extends WalkerTest {
    @Test
    public void testProbes() {
        String testVCF = validationDataLocation + "complexExample.vcf";
        String testVCF = validationDataLocation + "complexExample.vcf4";
        String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s";
        WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
                Arrays.asList("0f356354a4a78ff62b2848431ec11262"));
                Arrays.asList("46d2acea95d36725db63af61ee963ce6"));
        executeTest("Test probes", spec);
    }
@@ -11,7 +11,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
        String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
        String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
        WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
                Arrays.asList("2e273d400b4b69e39c34e465b200b192"));
                Arrays.asList("2dab4630f40b76c0762de83fcbb60d09"));
        executeTest("Test SNPs", spec);
    }

@@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
        String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
        String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
        WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
                Arrays.asList("e15a63fc49ec25ebcae60a28a5f3f830"));
                Arrays.asList("845b9a15ac947052ddded5b79228e5ec"));
        executeTest("Test Indels", spec);
    }
}
@@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
    @Test
    public void testVariantRecalibrator() {
        HashMap<String, String> e = new HashMap<String, String>();
        e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "1f7adb28007d77e65c02112480f56663" );
        e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "a1acb90f0695cbe33c290403113ac3e1" );

        for ( Map.Entry<String, String> entry : e.entrySet() ) {
            String vcf = entry.getKey();
@@ -35,9 +35,9 @@ public class RodSystemValidationIntegrationTest extends WalkerTest {
    public void testComplexVCFPileup() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" +
                " -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf"
                " -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4"
                , 1,
                Arrays.asList("c775c995c9fc09c66db51a694511d07b"));
                Arrays.asList("216b5de6f58be6cf3286ed5261772904"));
        executeTest("testComplexVCFPileup", spec);
    }
@@ -41,7 +41,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
    public void testVCFSelect1() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
                Arrays.asList("8b358e0cfa35de022a37360a6f28a839"));
                Arrays.asList("8fd97a99174483920f84aaf54eeb9dd9"));
        executeTest("testVCFSelect1", spec);
    }

@@ -49,7 +49,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
    public void testVCFSelect2() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1,
                Arrays.asList("8e991b9d6d610c8f89c8557756fc8e34"));
                Arrays.asList("ecb61d798ff5e0b003d2986500bf4462"));
        executeTest("testVCFSelect2", spec);
    }

@@ -57,7 +57,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
    public void testVCFSelectOr() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
                Arrays.asList("7bd064c8d8bf5389fcd0b78a7c32b599"));
                Arrays.asList("7c23412a0abe28057a78cd532b752019"));
        executeTest("testVCFSelectOr", spec);
    }

@@ -65,7 +65,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
    public void testVCFSelectAnd() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1,
                Arrays.asList("5af565836fa926feaa130715b93188bc"));
                Arrays.asList("71cc18348a9d5de7270613bedb79880a"));
        executeTest("testVCFSelectAnd", spec);
    }
}
@@ -1,8 +1,13 @@
package org.broadinstitute.sting.utils.genotype.vcf;

import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert;

@@ -10,7 +15,9 @@ import org.junit.Test;
import org.junit.BeforeClass;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

import net.sf.picard.reference.IndexedFastaSequenceFile;
@@ -34,30 +41,47 @@ public class VCFWriterUnitTest extends BaseTest {
        GenomeLocParser.setupRefContigOrdering(seq);
    }

    @Test
    public void emptyTest() {
    }

    /** test, using the writer and reader, that we can output and input a VCF file without problems */
    //@Test
    @Test
    public void testBasicWriteAndRead() {
        VCFHeader header = createFakeHeader(metaData,additionalColumns);
        VCFWriter writer = new VCFWriter(fakeVCFFile);
        writer.writeHeader(header);
        writer.addRecord(createVCFRecord(header));
        writer.addRecord(createVCFRecord(header));
        writer.add(createVC(header),"A".getBytes());
        writer.add(createVC(header),"A".getBytes());
        writer.close();
        VCFReader reader = new VCFReader(fakeVCFFile);
        int counter = 0;
        // validate what we're reading in
        validateHeader(reader.getHeader());
        for (VCFRecord rec : reader) {
            counter++;
        VCFCodec reader = new VCFCodec();
        AsciiLineReader lineReader;

        try {
            lineReader = new AsciiLineReader(new FileInputStream(fakeVCFFile));
            int lineNumber = reader.readHeader(lineReader);
        }
        Assert.assertEquals(2,counter);
        reader.close();
        new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete();
        //fakeVCFFile.delete();
        catch (FileNotFoundException e ) {
            throw new StingException(e.getMessage());
        }

        int counter = 0;

        // validate what we're reading in
        validateHeader(reader.getHeader(VCFHeader.class));
        try {
            while(true) {
                String line = lineReader.readLine();
                if (line == null)
                    break;

                VariantContext vc = (VariantContext)reader.decode(line);
                counter++;
            }
            Assert.assertEquals(2,counter);
            new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete();
            fakeVCFFile.delete();
        }
        catch (IOException e ) {
            throw new StingException(e.getMessage());
        }

    }

    /**
@@ -67,8 +91,8 @@ public class VCFWriterUnitTest extends BaseTest {
     * @return a fake VCF header
     */
    public static VCFHeader createFakeHeader(Set<VCFHeaderLine> metaData, Set<String> additionalColumns) {
        metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString(),VCFHeaderVersion.VCF3_3));
        metaData.add(new VCFHeaderLine("two", "2",VCFHeaderVersion.VCF3_3));
        metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
        metaData.add(new VCFHeaderLine("two", "2"));
        additionalColumns.add("FORMAT");
        additionalColumns.add("extra1");
        additionalColumns.add("extra2");
@@ -80,23 +104,29 @@ public class VCFWriterUnitTest extends BaseTest {
     * @param header the VCF header
     * @return a VCFRecord
     */
    private VCFRecord createVCFRecord(VCFHeader header) {
        List<VCFGenotypeEncoding> altBases = new ArrayList<VCFGenotypeEncoding>();
        altBases.add(new VCFGenotypeEncoding("C"));
        altBases.add(new VCFGenotypeEncoding("D1"));
        Map<String,String> infoFields = new HashMap<String,String>();
        infoFields.put("DP","50");
    private VariantContext createVC(VCFHeader header) {

        List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
        GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1",1);
        List<Allele> alleles = new ArrayList<Allele>();
        Set<String> filters = null;
        Map<String, String> attributes = new HashMap<String,String>();
        Map<String, Genotype> genotypes = new HashMap<String,Genotype>();

        alleles.add(Allele.create("A",true));
        alleles.add(Allele.create("CC",false));
        alleles.add(Allele.create("-",false));

        attributes.put("DP","50");
        for (String name : header.getGenotypeSamples()) {
            List<VCFGenotypeEncoding> myAlleles = new ArrayList<VCFGenotypeEncoding>();
            myAlleles.add(new VCFGenotypeEncoding("C"));
            myAlleles.add(new VCFGenotypeEncoding("D1"));
            VCFGenotypeRecord rec = new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED);
            rec.setField("bb", "0");
            gt.add(rec);
            Map<String, String> gtattributes = new HashMap<String,String>();
            gtattributes.put("BB","1");
            Genotype gt = new Genotype(name,alleles.subList(1,2),0,null,gtattributes,true);

            genotypes.put(name,gt);

        }
        return new VCFRecord("A","chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt);
        return new VariantContext("RANDOM",loc, alleles, genotypes, 0, filters, attributes);

    }