First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version. b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted. c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format. d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved. e) Several VCF 4.0 writer bugs are now solved. f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0. g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension. Pending issues: a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes. b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
75bea4881a
commit
55b756f1cc
|
|
@ -0,0 +1,124 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broad.tribble.exception.CodecLineParsingException;
|
||||||
|
import org.broad.tribble.util.LineReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: delangel
|
||||||
|
* Date: Jul 13, 2010
|
||||||
|
* Time: 3:57:01 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class VCF3Codec implements FeatureCodec {
|
||||||
|
|
||||||
|
// we have to store the list of strings that make up the header until they're needed
|
||||||
|
private List<String> headerStrings = new ArrayList<String>();
|
||||||
|
private VCFHeader header = null;
|
||||||
|
private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;
|
||||||
|
|
||||||
|
|
||||||
|
// some classes need to transform the line before
|
||||||
|
private LineTransform transformer = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fast path to get the location of the Feature for indexing
|
||||||
|
* @param line the input line to decode
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Feature decodeLoc(String line) {
|
||||||
|
return reallyDecode(line, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode a line as a Feature.
|
||||||
|
*
|
||||||
|
* @param line
|
||||||
|
*
|
||||||
|
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
|
||||||
|
* a comment)
|
||||||
|
*/
|
||||||
|
public Feature decode(String line) {
|
||||||
|
return reallyDecode(line, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Feature reallyDecode(String line, boolean justLocationPlease ) {
|
||||||
|
// transform the line, if we have a transform to do
|
||||||
|
if (transformer != null) line = transformer.lineTransform(line);
|
||||||
|
if (line.startsWith("#"))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// make a VCFRecord of the line and return it
|
||||||
|
VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
|
||||||
|
if ( ! justLocationPlease ) rec.setHeader(header);
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the # of header lines for this file. We use this to parse out the header
|
||||||
|
*
|
||||||
|
* @return 0
|
||||||
|
*/
|
||||||
|
public int readHeader(LineReader reader) {
|
||||||
|
String line = "";
|
||||||
|
try {
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
if (line.startsWith("##")) {
|
||||||
|
headerStrings.add(line);
|
||||||
|
}
|
||||||
|
else if (line.startsWith("#")) {
|
||||||
|
headerStrings.add(line);
|
||||||
|
header = VCFReaderUtils.createHeader(headerStrings,version);
|
||||||
|
return headerStrings.size();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("IO Exception ", e);
|
||||||
|
}
|
||||||
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return VCFRecord.class
|
||||||
|
*/
|
||||||
|
public Class getFeatureType() {
|
||||||
|
return VCFRecord.class;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
||||||
|
if (!clazz.equals(VCFHeader.class))
|
||||||
|
throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
|
||||||
|
return header;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static interface LineTransform {
|
||||||
|
public String lineTransform(String line);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LineTransform getTransformer() {
|
||||||
|
return transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTransformer(LineTransform transformer) {
|
||||||
|
this.transformer = transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFHeaderVersion getVersion() {
|
||||||
|
return version;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVersion(VCFHeaderVersion version) {
|
||||||
|
this.version = version;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
|
||||||
import org.broad.tribble.FeatureCodec;
|
import org.broad.tribble.FeatureCodec;
|
||||||
import org.broad.tribble.exception.CodecLineParsingException;
|
import org.broad.tribble.exception.CodecLineParsingException;
|
||||||
import org.broad.tribble.util.LineReader;
|
import org.broad.tribble.util.LineReader;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -18,110 +19,4 @@ import java.util.List;
|
||||||
*
|
*
|
||||||
* The codec for VCF, which relies on VCFReaderUtils to do most of the processing
|
* The codec for VCF, which relies on VCFReaderUtils to do most of the processing
|
||||||
*/
|
*/
|
||||||
public class VCFCodec implements FeatureCodec {
|
public class VCFCodec extends VCF4Codec {}
|
||||||
|
|
||||||
// we have to store the list of strings that make up the header until they're needed
|
|
||||||
private List<String> headerStrings = new ArrayList<String>();
|
|
||||||
private VCFHeader header = null;
|
|
||||||
private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;
|
|
||||||
|
|
||||||
|
|
||||||
// some classes need to transform the line before
|
|
||||||
private LineTransform transformer = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Fast path to get the location of the Feature for indexing
|
|
||||||
* @param line the input line to decode
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public Feature decodeLoc(String line) {
|
|
||||||
return reallyDecode(line, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Decode a line as a Feature.
|
|
||||||
*
|
|
||||||
* @param line
|
|
||||||
*
|
|
||||||
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
|
|
||||||
* a comment)
|
|
||||||
*/
|
|
||||||
public Feature decode(String line) {
|
|
||||||
return reallyDecode(line, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Feature reallyDecode(String line, boolean justLocationPlease ) {
|
|
||||||
// transform the line, if we have a transform to do
|
|
||||||
if (transformer != null) line = transformer.lineTransform(line);
|
|
||||||
if (line.startsWith("#"))
|
|
||||||
return null;
|
|
||||||
|
|
||||||
// make a VCFRecord of the line and return it
|
|
||||||
VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
|
|
||||||
if ( ! justLocationPlease ) rec.setHeader(header);
|
|
||||||
return rec;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return the # of header lines for this file. We use this to parse out the header
|
|
||||||
*
|
|
||||||
* @return 0
|
|
||||||
*/
|
|
||||||
public int readHeader(LineReader reader) {
|
|
||||||
String line = "";
|
|
||||||
try {
|
|
||||||
while ((line = reader.readLine()) != null) {
|
|
||||||
if (line.startsWith("##")) {
|
|
||||||
if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv3" ) )
|
|
||||||
throw new CodecLineParsingException("VCF codec can only parse VCF3 formated files. Your version line is " + line + ". If you want to parse VCF4, use VCF4 use VCF as the rod type");
|
|
||||||
headerStrings.add(line);
|
|
||||||
}
|
|
||||||
else if (line.startsWith("#")) {
|
|
||||||
headerStrings.add(line);
|
|
||||||
header = VCFReaderUtils.createHeader(headerStrings,version);
|
|
||||||
return headerStrings.size();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException("IO Exception ", e);
|
|
||||||
}
|
|
||||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return VCFRecord.class
|
|
||||||
*/
|
|
||||||
public Class getFeatureType() {
|
|
||||||
return VCFRecord.class;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
|
||||||
if (!clazz.equals(VCFHeader.class))
|
|
||||||
throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
|
|
||||||
return header;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static interface LineTransform {
|
|
||||||
public String lineTransform(String line);
|
|
||||||
}
|
|
||||||
|
|
||||||
public LineTransform getTransformer() {
|
|
||||||
return transformer;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setTransformer(LineTransform transformer) {
|
|
||||||
this.transformer = transformer;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VCFHeaderVersion getVersion() {
|
|
||||||
return version;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setVersion(VCFHeaderVersion version) {
|
|
||||||
this.version = version;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -82,7 +82,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
||||||
}
|
}
|
||||||
|
|
||||||
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
|
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) {
|
||||||
super(lineType.toString(), "", version);
|
super(lineType.toString(), "");
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.count = count;
|
this.count = count;
|
||||||
this.type = type;
|
this.type = type;
|
||||||
|
|
@ -98,7 +98,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
||||||
super(lineType.toString(), "", version);
|
super(lineType.toString(), "");
|
||||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||||
name = mapping.get("ID");
|
name = mapping.get("ID");
|
||||||
count = version == VCFHeaderVersion.VCF4_0 ?
|
count = version == VCFHeaderVersion.VCF4_0 ?
|
||||||
|
|
|
||||||
|
|
@ -34,23 +34,18 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader
|
||||||
* @param version the vcf header version
|
* @param version the vcf header version
|
||||||
*/
|
*/
|
||||||
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
||||||
super("FILTER", "", version);
|
super("FILTER", "");
|
||||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
||||||
name = mapping.get("ID");
|
name = mapping.get("ID");
|
||||||
description = mapping.get("Description");
|
description = mapping.get("Description");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String toStringEncoding() {
|
protected String toStringEncoding() {
|
||||||
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
|
||||||
return String.format("FILTER=%s,\"%s\"", name, description);
|
|
||||||
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
|
||||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||||
map.put("ID", name);
|
map.put("ID", name);
|
||||||
map.put("Description", description);
|
map.put("Description", description);
|
||||||
return "FILTER=" + VCFHeaderLine.toStringEncoding(map);
|
return "FILTER=" + VCFHeaderLine.toStringEncoding(map);
|
||||||
}
|
}
|
||||||
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if ( !(o instanceof VCFFilterHeaderLine) )
|
if ( !(o instanceof VCFFilterHeaderLine) )
|
||||||
|
|
|
||||||
|
|
@ -30,9 +30,6 @@ public class VCFHeader {
|
||||||
// the header string indicator
|
// the header string indicator
|
||||||
public static final String HEADER_INDICATOR = "#";
|
public static final String HEADER_INDICATOR = "#";
|
||||||
|
|
||||||
// our header version
|
|
||||||
private VCFHeaderVersion version;
|
|
||||||
|
|
||||||
/** do we have genotying data? */
|
/** do we have genotying data? */
|
||||||
private boolean hasGenotypingData = false;
|
private boolean hasGenotypingData = false;
|
||||||
|
|
||||||
|
|
@ -70,7 +67,6 @@ public class VCFHeader {
|
||||||
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
|
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
|
||||||
for ( VCFHeaderLine line : mMetaData )
|
for ( VCFHeaderLine line : mMetaData )
|
||||||
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
|
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
|
||||||
version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey());
|
|
||||||
toRemove.add(line);
|
toRemove.add(line);
|
||||||
}
|
}
|
||||||
// remove old header lines for now,
|
// remove old header lines for now,
|
||||||
|
|
@ -98,10 +94,7 @@ public class VCFHeader {
|
||||||
*/
|
*/
|
||||||
public Set<VCFHeaderLine> getMetaData() {
|
public Set<VCFHeaderLine> getMetaData() {
|
||||||
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
||||||
if (version == null)
|
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
|
||||||
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString()));
|
|
||||||
else
|
|
||||||
lines.add(new VCFHeaderLine(version.getFormatString(), version.getVersionString()));
|
|
||||||
lines.addAll(mMetaData);
|
lines.addAll(mMetaData);
|
||||||
return lines;
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
@ -129,19 +122,6 @@ public class VCFHeader {
|
||||||
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* convert the header to a new VCF version
|
|
||||||
* @param version the version to convert to
|
|
||||||
*/
|
|
||||||
public void setVersion(VCFHeaderVersion version) {
|
|
||||||
if (version.equals(this.version))
|
|
||||||
return; // we're all set, do nothing
|
|
||||||
|
|
||||||
// store the new version, and update each of the header lines
|
|
||||||
this.version = version;
|
|
||||||
for (VCFHeaderLine line : mMetaData)
|
|
||||||
line.setVersion(version);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,19 +43,6 @@ public class VCFHeaderLine implements Comparable {
|
||||||
private String mKey = null;
|
private String mKey = null;
|
||||||
private String mValue = null;
|
private String mValue = null;
|
||||||
|
|
||||||
protected VCFHeaderVersion mVersion = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create a VCF header line
|
|
||||||
*
|
|
||||||
* @param key the key for this header line
|
|
||||||
* @param value the value for this header line
|
|
||||||
*/
|
|
||||||
public VCFHeaderLine(String key, String value, VCFHeaderVersion version) {
|
|
||||||
mKey = key;
|
|
||||||
mValue = value;
|
|
||||||
mVersion = version;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a VCF header line
|
* create a VCF header line
|
||||||
|
|
@ -66,7 +53,6 @@ public class VCFHeaderLine implements Comparable {
|
||||||
public VCFHeaderLine(String key, String value) {
|
public VCFHeaderLine(String key, String value) {
|
||||||
mKey = key;
|
mKey = key;
|
||||||
mValue = value;
|
mValue = value;
|
||||||
mVersion = VCFHeaderVersion.VCF3_3;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -127,18 +113,6 @@ public class VCFHeaderLine implements Comparable {
|
||||||
return toString().compareTo(other.toString());
|
return toString().compareTo(other.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* set the version string, which resets the current stored string representation if the version changed
|
|
||||||
* @param version
|
|
||||||
*/
|
|
||||||
public void setVersion(VCFHeaderVersion version) {
|
|
||||||
if (!version.equals(this.mVersion)) this.stringRep = null;
|
|
||||||
this.mVersion = version;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VCFHeaderVersion getVersion() {
|
|
||||||
return mVersion;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a string of a mapping pair for the target VCF version
|
* create a string of a mapping pair for the target VCF version
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ public class VCFReaderUtils {
|
||||||
* package protected so that the VCFReaderUtils can access this function
|
* package protected so that the VCFReaderUtils can access this function
|
||||||
*
|
*
|
||||||
* @param headerStrings a list of header strings
|
* @param headerStrings a list of header strings
|
||||||
*
|
* @param version Header version to parse
|
||||||
* @return a VCF Header created from the list of stinrgs
|
* @return a VCF Header created from the list of stinrgs
|
||||||
*/
|
*/
|
||||||
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
|
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
|
||||||
|
|
@ -53,7 +53,7 @@ public class VCFReaderUtils {
|
||||||
else {
|
else {
|
||||||
int equals = str.indexOf("=");
|
int equals = str.indexOf("=");
|
||||||
if ( equals != -1 )
|
if ( equals != -1 )
|
||||||
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version));
|
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,12 +25,11 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
|
|
||||||
// a variant context flag for original allele strings
|
|
||||||
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
|
|
||||||
|
|
||||||
// we have to store the list of strings that make up the header until they're needed
|
// we have to store the list of strings that make up the header until they're needed
|
||||||
private VCFHeader header = null;
|
private VCFHeader header = null;
|
||||||
|
|
||||||
|
private VCFHeaderVersion version = VCFHeaderVersion.VCF4_0;
|
||||||
// used to convert the index of the alternate allele in genotypes to a integer index
|
// used to convert the index of the alternate allele in genotypes to a integer index
|
||||||
private static int ZERO_CHAR = (byte)'0';
|
private static int ZERO_CHAR = (byte)'0';
|
||||||
|
|
||||||
|
|
@ -66,6 +65,9 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
|
|
||||||
private int lineNo = 0;
|
private int lineNo = 0;
|
||||||
|
|
||||||
|
// some classes need to transform the line before
|
||||||
|
private LineTransform transformer = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
||||||
* @param reader the line reader to take header lines from
|
* @param reader the line reader to take header lines from
|
||||||
|
|
@ -77,14 +79,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
|
|
||||||
String line = "";
|
String line = "";
|
||||||
try {
|
try {
|
||||||
|
boolean foundHeaderVersion = false;
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
lineNo++;
|
lineNo++;
|
||||||
if (line.startsWith("##")) {
|
if (line.startsWith("##")) {
|
||||||
if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv4" ) )
|
String[] lineFields = line.substring(2).split("=");
|
||||||
throw new CodecLineParsingException("VCF4 codec can only parse VCF4 formated files. Your version line is " + line + ". If you want VCF3 parsing, use VCF as the rod type.");
|
if (lineFields.length == 2 &&
|
||||||
|
VCFHeaderVersion.isVersionString(lineFields[1]) && VCFHeaderVersion.isFormatString(lineFields[0])) {
|
||||||
|
foundHeaderVersion = true;
|
||||||
|
this.version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
|
||||||
|
}
|
||||||
headerStrings.add(line);
|
headerStrings.add(line);
|
||||||
}
|
}
|
||||||
else if (line.startsWith("#")) {
|
else if (line.startsWith("#")) {
|
||||||
|
if (!foundHeaderVersion) {
|
||||||
|
throw new CodecLineParsingException("We never saw a header line specifying VCF version");
|
||||||
|
}
|
||||||
return createHeader(headerStrings, line);
|
return createHeader(headerStrings, line);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
@ -107,7 +117,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
*/
|
*/
|
||||||
public int createHeader(List<String> headerStrings, String line) {
|
public int createHeader(List<String> headerStrings, String line) {
|
||||||
headerStrings.add(line);
|
headerStrings.add(line);
|
||||||
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
|
header = VCFReaderUtils.createHeader(headerStrings, this.version);
|
||||||
|
|
||||||
// load the parsing fields
|
// load the parsing fields
|
||||||
Set<VCFHeaderLine> headerLines = header.getMetaData();
|
Set<VCFHeaderLine> headerLines = header.getMetaData();
|
||||||
|
|
@ -226,12 +236,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
String[] split = str.split(",");
|
String[] split = str.split(",");
|
||||||
for (String substring : split) {
|
for (String substring : split) {
|
||||||
VCFHeaderLineType type = infoFields.get(key);
|
VCFHeaderLineType type = infoFields.get(key);
|
||||||
objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring);
|
// objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring);
|
||||||
|
objects.add(substring);
|
||||||
}
|
}
|
||||||
value = objects;
|
value = objects;
|
||||||
} else {
|
} else {
|
||||||
VCFHeaderLineType type = infoFields.get(key);
|
VCFHeaderLineType type = infoFields.get(key);
|
||||||
value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str;
|
//value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str;
|
||||||
|
value = str;
|
||||||
}
|
}
|
||||||
//System.out.printf("%s %s%n", key, value);
|
//System.out.printf("%s %s%n", key, value);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -269,9 +281,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
* @param qualString the quality string
|
* @param qualString the quality string
|
||||||
* @return return a double
|
* @return return a double
|
||||||
*/
|
*/
|
||||||
private static Double parseQual(String qualString) {
|
private Double parseQual(String qualString) {
|
||||||
// todo -- remove double once we deal with annoying VCFs from 1KG
|
if (qualString.equals(VCFConstants.MISSING_VALUE_v4))
|
||||||
if ( qualString.equals(".") )
|
|
||||||
return VariantContext.NO_NEG_LOG_10PERROR;
|
return VariantContext.NO_NEG_LOG_10PERROR;
|
||||||
else {
|
else {
|
||||||
double q = Double.valueOf(qualString);
|
double q = Double.valueOf(qualString);
|
||||||
|
|
@ -346,7 +357,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
Set<String> fFields;
|
Set<String> fFields;
|
||||||
|
|
||||||
// a PASS is simple (no filters)
|
// a PASS is simple (no filters)
|
||||||
if ( filterString.equals("PASS") ) {
|
String passString = VCFConstants.PASSES_FILTERS_v3;
|
||||||
|
if (this.version == VCFHeaderVersion.VCF4_0)
|
||||||
|
passString = VCFConstants.PASSES_FILTERS_v4;
|
||||||
|
|
||||||
|
if ( filterString.equals(passString) ) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if ( filterString.equals(VCFConstants.UNFILTERED)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
// else do we have the filter string cached?
|
// else do we have the filter string cached?
|
||||||
|
|
@ -373,6 +391,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
* parse out the VCF line
|
* parse out the VCF line
|
||||||
*
|
*
|
||||||
* @param parts the parts split up
|
* @param parts the parts split up
|
||||||
|
* @param parseGenotypes whether to parse genotypes or not
|
||||||
* @return a variant context object
|
* @return a variant context object
|
||||||
*/
|
*/
|
||||||
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
||||||
|
|
@ -398,7 +417,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
// find out our current location, and clip the alleles down to their minimum length
|
// find out our current location, and clip the alleles down to their minimum length
|
||||||
Pair<GenomeLoc, List<Allele>> locAndAlleles;
|
Pair<GenomeLoc, List<Allele>> locAndAlleles;
|
||||||
if ( !isSingleNucleotideEvent(alleles) ) {
|
if ( !isSingleNucleotideEvent(alleles) ) {
|
||||||
attributes.put(ORIGINAL_ALLELE_LIST,alleles);
|
if (this.version != VCFHeaderVersion.VCF4_0)
|
||||||
|
throw new VCFParserException("Saw Indel/non SNP event on a VCF 3.3 or earlier file. Please convert file to VCF4.0 with VCFTools before using the GATK on it");
|
||||||
locAndAlleles = clipAlleles(contig, pos, ref, alleles);
|
locAndAlleles = clipAlleles(contig, pos, ref, alleles);
|
||||||
} else {
|
} else {
|
||||||
locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
|
locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
|
||||||
|
|
@ -469,8 +489,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
if (nGTKeys >= 1) {
|
if (nGTKeys >= 1) {
|
||||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||||
for (int i = 0; i < nGTKeys; i++) {
|
for (int i = 0; i < nGTKeys; i++) {
|
||||||
if (i >= GTValueSplitSize)
|
if (i >= GTValueSplitSize) {
|
||||||
gtAttributes.put(genotypeKeyArray[i],".");
|
if (genotypeKeyArray[i].equals("GQ"))
|
||||||
|
GTQual = parseQual(VCFConstants.MISSING_VALUE_v4);
|
||||||
|
else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
|
||||||
|
genotypeFilters = parseFilters(VCFConstants.MISSING_VALUE_v4);
|
||||||
|
else
|
||||||
|
gtAttributes.put(genotypeKeyArray[i],VCFConstants.MISSING_VALUE_v4);
|
||||||
|
}
|
||||||
else if (genotypeKeyArray[i].equals("GT"))
|
else if (genotypeKeyArray[i].equals("GT"))
|
||||||
if (i != 0)
|
if (i != 0)
|
||||||
throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first);
|
||||||
|
|
@ -480,9 +506,11 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
GTQual = parseQual(GTValueArray[i]);
|
GTQual = parseQual(GTValueArray[i]);
|
||||||
else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
|
else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here
|
||||||
genotypeFilters = parseFilters(GTValueArray[i]);
|
genotypeFilters = parseFilters(GTValueArray[i]);
|
||||||
else
|
else {
|
||||||
|
if (this.version != VCFHeaderVersion.VCF4_0 && GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3))
|
||||||
|
GTValueArray[i] = VCFConstants.MISSING_VALUE_v4;
|
||||||
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);
|
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// validate the format fields
|
// validate the format fields
|
||||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||||
|
|
@ -517,7 +545,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||||
List<Allele> newAlleleList = new ArrayList<Allele>();
|
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||||
|
|
||||||
// find the preceeding string common to all alleles and the reference
|
/* //+
|
||||||
|
boolean clipping = true;
|
||||||
|
int forwardClipping = 0;
|
||||||
|
while(clipping) {
|
||||||
|
for (Allele a : unclippedAlleles) {
|
||||||
|
if (a.length()-forwardClipping == 0)
|
||||||
|
clipping = false;
|
||||||
|
else if (a.getBases()[forwardClipping] != ref.getBytes()[forwardClipping])
|
||||||
|
clipping = false;
|
||||||
|
if (clipping) forwardClipping++;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
//-
|
||||||
|
*/ // find the preceeding string common to all alleles and the reference
|
||||||
boolean clipping = true;
|
boolean clipping = true;
|
||||||
for (Allele a : unclippedAlleles)
|
for (Allele a : unclippedAlleles)
|
||||||
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
|
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
|
||||||
|
|
@ -576,6 +619,19 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static interface LineTransform {
|
||||||
|
public String lineTransform(String line);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LineTransform getTransformer() {
|
||||||
|
return transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTransformer(LineTransform transformer) {
|
||||||
|
this.transformer = transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the name of this codec
|
* set the name of this codec
|
||||||
* @param name
|
* @param name
|
||||||
|
|
@ -583,4 +639,6 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||||
public void setName(String name) {
|
public void setName(String name) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,7 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
protected static String line = null;
|
protected static String line = null;
|
||||||
final TreeSet<String> samples = new TreeSet<String>();
|
final TreeSet<String> samples = new TreeSet<String>();
|
||||||
VCF4Codec vcf4codec = new VCF4Codec();
|
VCFCodec vcf4codec = new VCFCodec();
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -86,7 +86,6 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
|
||||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||||
final RMDTrack rod = source.getReferenceOrderedData();
|
final RMDTrack rod = source.getReferenceOrderedData();
|
||||||
if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
|
if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
|
||||||
if (rod.getType().equals(vcf4codec.getClass())) {
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
|
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
|
||||||
|
|
@ -98,20 +97,14 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
|
||||||
catch (FileNotFoundException e ) {
|
catch (FileNotFoundException e ) {
|
||||||
throw new StingException(e.getMessage());
|
throw new StingException(e.getMessage());
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
final VCFReader reader = new VCFReader(rod.getFile());
|
final Set<String> vcfSamples = header.getGenotypeSamples();
|
||||||
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
|
||||||
samples.addAll(vcfSamples);
|
samples.addAll(vcfSamples);
|
||||||
reader.close();
|
|
||||||
header = new VCFHeader(hInfo, samples);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( header != null )
|
|
||||||
header.setVersion(VCFHeaderVersion.VCF4_0);
|
|
||||||
|
|
||||||
vcfWriter.writeHeader(header);
|
vcfWriter.writeHeader(header);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -86,10 +86,10 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
if ( vc_eval == null || vc_eval.isFiltered() )
|
if ( vc_eval == null || vc_eval.isFiltered() )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (vc_eval.getType() != VariantContext.Type.SNP)
|
if (!vc_eval.isSNP())
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (vc_eval.getAlleles().size()!= 2)
|
if (!vc_eval.isBiallelic())
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// output marker ID to Beagle input file
|
// output marker ID to Beagle input file
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
@ -61,7 +62,7 @@ public class VCFUtils {
|
||||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||||
RMDTrack rod = source.getReferenceOrderedData();
|
RMDTrack rod = source.getReferenceOrderedData();
|
||||||
if ( rod.getRecordType().equals(VCFRecord.class) ) {
|
if ( rod.getRecordType().equals(VariantContext.class) ) {
|
||||||
fields.addAll(rod.getHeader(VCFHeader.class).getMetaData());
|
fields.addAll(rod.getHeader(VCFHeader.class).getMetaData());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -139,7 +140,7 @@ public class VCFUtils {
|
||||||
|
|
||||||
if ( map.containsKey(key) ) {
|
if ( map.containsKey(key) ) {
|
||||||
VCFHeaderLine other = map.get(key);
|
VCFHeaderLine other = map.get(key);
|
||||||
if ( line.equals(other) || line.getVersion() != VCFHeaderVersion.VCF4_0 ) // todo -- remove me when everything is 4
|
if ( line.equals(other) )
|
||||||
continue;
|
continue;
|
||||||
else if ( ! line.getClass().equals(other.getClass()) )
|
else if ( ! line.getClass().equals(other.getClass()) )
|
||||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||||
|
|
@ -173,7 +174,6 @@ public class VCFUtils {
|
||||||
if ( logger != null ) logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
if ( logger != null ) logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
line.setVersion(VCFHeaderVersion.VCF4_0); // todo -- remove this when we finally have vcf3/4 unified headers
|
|
||||||
map.put(key, line);
|
map.put(key, line);
|
||||||
//System.out.printf("Adding header line %s%n", line);
|
//System.out.printf("Adding header line %s%n", line);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -201,9 +201,6 @@ public class VCFWriter {
|
||||||
String paddingBases = "";
|
String paddingBases = "";
|
||||||
String trailingBases = "";
|
String trailingBases = "";
|
||||||
|
|
||||||
ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");
|
|
||||||
|
|
||||||
|
|
||||||
// search for reference allele and find trailing and padding at the end.
|
// search for reference allele and find trailing and padding at the end.
|
||||||
// See first if all alleles have a base encoding (ie no deletions)
|
// See first if all alleles have a base encoding (ie no deletions)
|
||||||
// if so, add one common base to all alleles (reference at this location)
|
// if so, add one common base to all alleles (reference at this location)
|
||||||
|
|
@ -217,51 +214,13 @@ public class VCFWriter {
|
||||||
refString = new String(a.getBases());
|
refString = new String(a.getBases());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (originalAlleles != null) {
|
|
||||||
// if original allele info was filled when reading from a VCF4 file,
|
|
||||||
// determine whether there was a padding base(s) at the beginning and end.
|
|
||||||
byte previousBase = 0;
|
|
||||||
int cnt=0;
|
|
||||||
boolean firstBaseCommonInAllAlleles = true;
|
|
||||||
Allele originalReferenceAllele = null;
|
|
||||||
for (Allele originalAllele : originalAlleles){
|
|
||||||
// if first base of allele is common to all of them, there may have been a common base deleted from all
|
|
||||||
byte firstBase = originalAllele.getBases()[0];
|
|
||||||
if (cnt > 0) {
|
|
||||||
if (firstBase != previousBase)
|
|
||||||
firstBaseCommonInAllAlleles = false;
|
|
||||||
}
|
|
||||||
previousBase = firstBase;
|
|
||||||
cnt++;
|
|
||||||
|
|
||||||
if (originalAllele.isReference())
|
|
||||||
originalReferenceAllele = originalAllele;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
numTrailingBases = (firstBaseCommonInAllAlleles)? 1:0;
|
|
||||||
position -= numTrailingBases;
|
|
||||||
|
|
||||||
if (originalReferenceAllele == null)
|
|
||||||
throw new IllegalStateException("At least one Allele must be reference");
|
|
||||||
|
|
||||||
String originalRef = new String(originalReferenceAllele.getBases());
|
|
||||||
numPaddingBases = originalRef.length()-refString.length()-numTrailingBases;
|
|
||||||
|
|
||||||
if (numTrailingBases > 0) {
|
|
||||||
trailingBases = originalRef.substring(0,numTrailingBases);
|
|
||||||
}
|
|
||||||
if (numPaddingBases > 0)
|
|
||||||
paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length());
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
|
// Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
|
||||||
if (!hasBasesInAllAlleles) {
|
if (!hasBasesInAllAlleles) {
|
||||||
trailingBases = new String(refBases);
|
trailingBases = new String(refBases);
|
||||||
numTrailingBases = 1;
|
numTrailingBases = 1;
|
||||||
position--;
|
position--;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
for ( Allele a : vc.getAlleles() ) {
|
for ( Allele a : vc.getAlleles() ) {
|
||||||
|
|
@ -295,6 +254,7 @@ public class VCFWriter {
|
||||||
// this needs to be done in case all samples are no-calls
|
// this needs to be done in case all samples are no-calls
|
||||||
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||||
}
|
}
|
||||||
|
|
||||||
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
|
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
|
||||||
|
|
||||||
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
|
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
|
||||||
|
|
@ -378,10 +338,6 @@ public class VCFWriter {
|
||||||
if ( key.equals("ID") )
|
if ( key.equals("ID") )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Original alleles are not for reporting but only for internal bookkeeping
|
|
||||||
if (key.equals("ORIGINAL_ALLELE_LIST"))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
String outputValue = formatVCFField(key, elt.getValue());
|
String outputValue = formatVCFField(key, elt.getValue());
|
||||||
if ( outputValue != null )
|
if ( outputValue != null )
|
||||||
infoFields.put(key, outputValue);
|
infoFields.put(key, outputValue);
|
||||||
|
|
@ -587,14 +543,21 @@ public class VCFWriter {
|
||||||
Set<String> keys = new HashSet<String>();
|
Set<String> keys = new HashSet<String>();
|
||||||
|
|
||||||
boolean sawGoodQual = false;
|
boolean sawGoodQual = false;
|
||||||
|
boolean sawGenotypeFilter = false;
|
||||||
for ( Genotype g : vc.getGenotypes().values() ) {
|
for ( Genotype g : vc.getGenotypes().values() ) {
|
||||||
keys.addAll(g.getAttributes().keySet());
|
keys.addAll(g.getAttributes().keySet());
|
||||||
if ( g.hasNegLog10PError() )
|
if ( g.hasNegLog10PError() )
|
||||||
sawGoodQual = true;
|
sawGoodQual = true;
|
||||||
|
if (g.isFiltered() && g.isCalled())
|
||||||
|
sawGenotypeFilter = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( sawGoodQual )
|
if ( sawGoodQual )
|
||||||
keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
|
keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
|
||||||
|
|
||||||
|
if (sawGenotypeFilter)
|
||||||
|
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
|
||||||
|
|
||||||
return Utils.sorted(new ArrayList<String>(keys));
|
return Utils.sorted(new ArrayList<String>(keys));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -205,20 +205,20 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
|
String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
|
||||||
@Test
|
@Test
|
||||||
public void testClippedAllelesAddedAsAnnotation() {
|
public void testClippedAllelesAddedAsAnnotation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
// TODO - either modify or kill this test
|
||||||
|
/* TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine);
|
VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine);
|
||||||
Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
|
Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
|
||||||
List<Allele> alleles = (List<Allele>)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST);
|
List<Allele> alleles = (List<Allele>)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST);
|
||||||
Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size());
|
Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size());
|
||||||
Assert.assertTrue(alleles.get(0).basesMatch("GGG"));
|
Assert.assertTrue(alleles.get(0).basesMatch("GGG"));
|
||||||
Assert.assertTrue(alleles.get(1).basesMatch("G"));
|
Assert.assertTrue(alleles.get(1).basesMatch("G"));
|
||||||
}
|
*/ }
|
||||||
// test that when we don't clip the alleles, we don't see the annotation
|
// test that when we don't clip the alleles, we don't see the annotation
|
||||||
@Test
|
@Test
|
||||||
public void testNoClippedAllelesNoAddedAsAnnotation() {
|
public void testNoClippedAllelesNoAddedAsAnnotation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine);
|
VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine);
|
||||||
Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// test that we're getting the right genotype for a multi-base polymorphism
|
// test that we're getting the right genotype for a multi-base polymorphism
|
||||||
|
|
|
||||||
|
|
@ -70,11 +70,11 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testGenotypesToVCFUsingVCFInput() {
|
public void testGenotypesToVCFUsingVCFInput() {
|
||||||
List<String> md5 = new ArrayList<String>();
|
List<String> md5 = new ArrayList<String>();
|
||||||
md5.add("b423141ca600d581dc73e9b3dff4f782");
|
md5.add("919eb499bfcc980a14825a0265e575e3");
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
" -B variant,VCF," + validationDataLocation + "complexExample.vcf" +
|
" -B variant,VCF," + validationDataLocation + "complexExample.vcf4" +
|
||||||
" -T VariantsToVCF" +
|
" -T VariantsToVCF" +
|
||||||
" -o %s",
|
" -o %s",
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
|
|
|
||||||
|
|
@ -16,9 +16,9 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
|
||||||
executeTest("testFastaReference", spec1);
|
executeTest("testFastaReference", spec1);
|
||||||
|
|
||||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||||
"-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s",
|
"-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("3a48986c3832a768b478c3e95f994b0f"));
|
Arrays.asList("a7bfc673eaa202a668c1424a933671ad"));
|
||||||
executeTest("testFastaAlternateReferenceIndels", spec2);
|
executeTest("testFastaAlternateReferenceIndels", spec2);
|
||||||
|
|
||||||
WalkerTestSpec spec4 = new WalkerTestSpec(
|
WalkerTestSpec spec4 = new WalkerTestSpec(
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
public void testClusteredSnps() {
|
public void testClusteredSnps() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -window 10 -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
baseTestString() + " -window 10 -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||||
Arrays.asList("fa522e5b8c89c5a465ca03dd88bc2b8f"));
|
Arrays.asList("2385975931cd06fca452655bebf5c379"));
|
||||||
executeTest("test clustered SNPs", spec);
|
executeTest("test clustered SNPs", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
public void testMask() {
|
public void testMask() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -mask foo -B mask,VCF," + validationDataLocation + "vcfexample2.vcf -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
baseTestString() + " -mask foo -B mask,VCF," + validationDataLocation + "vcfexample2.vcf -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||||
Arrays.asList("4d8a184417289c325410a5e545d9fc80"));
|
Arrays.asList("6743efa09985206819adcf8eaf5ff936"));
|
||||||
executeTest("test mask", spec);
|
executeTest("test mask", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
public void testFilter1() {
|
public void testFilter1() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||||
Arrays.asList("d48062f139d853a61e4f0bfb76bd695d"));
|
Arrays.asList("e054f57d3794ce8a57cae92f16886cf0"));
|
||||||
executeTest("test filter #1", spec);
|
executeTest("test filter #1", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
public void testFilter2() {
|
public void testFilter2() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||||
Arrays.asList("791f89c6da7bdde79913754d9ddb72eb"));
|
Arrays.asList("db09a1f7ff523087ea0f6a56f3febfe7"));
|
||||||
executeTest("test filter #2", spec);
|
executeTest("test filter #2", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
public void testFilterWithSeparateNames() {
|
public void testFilterWithSeparateNames() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 70.0' --filterName FSF -filter 'FisherStrand == 1.4' -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 70.0' --filterName FSF -filter 'FisherStrand == 1.4' -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||||
Arrays.asList("174caf880477f3edf68fe6722f227017"));
|
Arrays.asList("edbd505d8d55b4ba71f99d7006871db0"));
|
||||||
executeTest("test filter with separate names #2", spec);
|
executeTest("test filter with separate names #2", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
|
||||||
Arrays.asList("9ebe61dcb5112e7e745412d7767d101a"));
|
Arrays.asList("e3f402fbbb6bbb4f60b1aa0549989d85"));
|
||||||
executeTest("testConfidence2", spec2);
|
executeTest("testConfidence2", spec2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -29,9 +29,9 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRealignerKnownsOnly() {
|
public void testRealignerKnownsOnly() {
|
||||||
String[] md5s = {"7084d4e543bc756730ab306768028530", "1091436c40d5ba557d85662999cc0c1d"};
|
String[] md5s = {"83bc0c9a7d8872b552c6cbd994672c3b", "92bf331b672a03d63c26e4d9d3615f5b"};
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
|
"-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
|
||||||
2,
|
2,
|
||||||
Arrays.asList(md5s));
|
Arrays.asList(md5s));
|
||||||
executeTest("test realigner known indels only", spec);
|
executeTest("test realigner known indels only", spec);
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,10 @@ import java.util.Arrays;
|
||||||
public class PickSequenomProbesIntegrationTest extends WalkerTest {
|
public class PickSequenomProbesIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testProbes() {
|
public void testProbes() {
|
||||||
String testVCF = validationDataLocation + "complexExample.vcf";
|
String testVCF = validationDataLocation + "complexExample.vcf4";
|
||||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s";
|
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s";
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
Arrays.asList("0f356354a4a78ff62b2848431ec11262"));
|
Arrays.asList("46d2acea95d36725db63af61ee963ce6"));
|
||||||
executeTest("Test probes", spec);
|
executeTest("Test probes", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
||||||
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
|
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
|
||||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
Arrays.asList("2e273d400b4b69e39c34e465b200b192"));
|
Arrays.asList("2dab4630f40b76c0762de83fcbb60d09"));
|
||||||
executeTest("Test SNPs", spec);
|
executeTest("Test SNPs", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
||||||
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
|
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
|
||||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
Arrays.asList("e15a63fc49ec25ebcae60a28a5f3f830"));
|
Arrays.asList("845b9a15ac947052ddded5b79228e5ec"));
|
||||||
executeTest("Test Indels", spec);
|
executeTest("Test Indels", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testVariantRecalibrator() {
|
public void testVariantRecalibrator() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "1f7adb28007d77e65c02112480f56663" );
|
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "a1acb90f0695cbe33c290403113ac3e1" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String vcf = entry.getKey();
|
String vcf = entry.getKey();
|
||||||
|
|
|
||||||
|
|
@ -35,9 +35,9 @@ public class RodSystemValidationIntegrationTest extends WalkerTest {
|
||||||
public void testComplexVCFPileup() {
|
public void testComplexVCFPileup() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" +
|
baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" +
|
||||||
" -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf"
|
" -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4"
|
||||||
, 1,
|
, 1,
|
||||||
Arrays.asList("c775c995c9fc09c66db51a694511d07b"));
|
Arrays.asList("216b5de6f58be6cf3286ed5261772904"));
|
||||||
executeTest("testComplexVCFPileup", spec);
|
executeTest("testComplexVCFPileup", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
|
||||||
public void testVCFSelect1() {
|
public void testVCFSelect1() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
|
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
|
||||||
Arrays.asList("8b358e0cfa35de022a37360a6f28a839"));
|
Arrays.asList("8fd97a99174483920f84aaf54eeb9dd9"));
|
||||||
executeTest("testVCFSelect1", spec);
|
executeTest("testVCFSelect1", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -49,7 +49,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
|
||||||
public void testVCFSelect2() {
|
public void testVCFSelect2() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1,
|
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1,
|
||||||
Arrays.asList("8e991b9d6d610c8f89c8557756fc8e34"));
|
Arrays.asList("ecb61d798ff5e0b003d2986500bf4462"));
|
||||||
executeTest("testVCFSelect2", spec);
|
executeTest("testVCFSelect2", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -57,7 +57,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
|
||||||
public void testVCFSelectOr() {
|
public void testVCFSelectOr() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
|
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1,
|
||||||
Arrays.asList("7bd064c8d8bf5389fcd0b78a7c32b599"));
|
Arrays.asList("7c23412a0abe28057a78cd532b752019"));
|
||||||
executeTest("testVCFSelectOr", spec);
|
executeTest("testVCFSelectOr", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -65,7 +65,7 @@ public class VariantSelectIntegrationTest extends WalkerTest {
|
||||||
public void testVCFSelectAnd() {
|
public void testVCFSelectAnd() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1,
|
baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1,
|
||||||
Arrays.asList("5af565836fa926feaa130715b93188bc"));
|
Arrays.asList("71cc18348a9d5de7270613bedb79880a"));
|
||||||
executeTest("testVCFSelectAnd", spec);
|
executeTest("testVCFSelectAnd", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,8 +1,13 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.AsciiLineReader;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
|
|
@ -10,7 +15,9 @@ import org.junit.Test;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
|
@ -34,30 +41,47 @@ public class VCFWriterUnitTest extends BaseTest {
|
||||||
GenomeLocParser.setupRefContigOrdering(seq);
|
GenomeLocParser.setupRefContigOrdering(seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void emptyTest() {
|
|
||||||
}
|
|
||||||
|
|
||||||
/** test, using the writer and reader, that we can output and input a VCF file without problems */
|
/** test, using the writer and reader, that we can output and input a VCF file without problems */
|
||||||
//@Test
|
@Test
|
||||||
public void testBasicWriteAndRead() {
|
public void testBasicWriteAndRead() {
|
||||||
VCFHeader header = createFakeHeader(metaData,additionalColumns);
|
VCFHeader header = createFakeHeader(metaData,additionalColumns);
|
||||||
VCFWriter writer = new VCFWriter(fakeVCFFile);
|
VCFWriter writer = new VCFWriter(fakeVCFFile);
|
||||||
writer.writeHeader(header);
|
writer.writeHeader(header);
|
||||||
writer.addRecord(createVCFRecord(header));
|
writer.add(createVC(header),"A".getBytes());
|
||||||
writer.addRecord(createVCFRecord(header));
|
writer.add(createVC(header),"A".getBytes());
|
||||||
writer.close();
|
writer.close();
|
||||||
VCFReader reader = new VCFReader(fakeVCFFile);
|
VCFCodec reader = new VCFCodec();
|
||||||
|
AsciiLineReader lineReader;
|
||||||
|
|
||||||
|
try {
|
||||||
|
lineReader = new AsciiLineReader(new FileInputStream(fakeVCFFile));
|
||||||
|
int lineNumber = reader.readHeader(lineReader);
|
||||||
|
}
|
||||||
|
catch (FileNotFoundException e ) {
|
||||||
|
throw new StingException(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
|
|
||||||
// validate what we're reading in
|
// validate what we're reading in
|
||||||
validateHeader(reader.getHeader());
|
validateHeader(reader.getHeader(VCFHeader.class));
|
||||||
for (VCFRecord rec : reader) {
|
try {
|
||||||
|
while(true) {
|
||||||
|
String line = lineReader.readLine();
|
||||||
|
if (line == null)
|
||||||
|
break;
|
||||||
|
|
||||||
|
VariantContext vc = (VariantContext)reader.decode(line);
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
Assert.assertEquals(2,counter);
|
Assert.assertEquals(2,counter);
|
||||||
reader.close();
|
|
||||||
new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete();
|
new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete();
|
||||||
//fakeVCFFile.delete();
|
fakeVCFFile.delete();
|
||||||
|
}
|
||||||
|
catch (IOException e ) {
|
||||||
|
throw new StingException(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -67,8 +91,8 @@ public class VCFWriterUnitTest extends BaseTest {
|
||||||
* @return a fake VCF header
|
* @return a fake VCF header
|
||||||
*/
|
*/
|
||||||
public static VCFHeader createFakeHeader(Set<VCFHeaderLine> metaData, Set<String> additionalColumns) {
|
public static VCFHeader createFakeHeader(Set<VCFHeaderLine> metaData, Set<String> additionalColumns) {
|
||||||
metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString(),VCFHeaderVersion.VCF3_3));
|
metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
|
||||||
metaData.add(new VCFHeaderLine("two", "2",VCFHeaderVersion.VCF3_3));
|
metaData.add(new VCFHeaderLine("two", "2"));
|
||||||
additionalColumns.add("FORMAT");
|
additionalColumns.add("FORMAT");
|
||||||
additionalColumns.add("extra1");
|
additionalColumns.add("extra1");
|
||||||
additionalColumns.add("extra2");
|
additionalColumns.add("extra2");
|
||||||
|
|
@ -80,23 +104,29 @@ public class VCFWriterUnitTest extends BaseTest {
|
||||||
* @param header the VCF header
|
* @param header the VCF header
|
||||||
* @return a VCFRecord
|
* @return a VCFRecord
|
||||||
*/
|
*/
|
||||||
private VCFRecord createVCFRecord(VCFHeader header) {
|
private VariantContext createVC(VCFHeader header) {
|
||||||
List<VCFGenotypeEncoding> altBases = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
altBases.add(new VCFGenotypeEncoding("C"));
|
|
||||||
altBases.add(new VCFGenotypeEncoding("D1"));
|
|
||||||
Map<String,String> infoFields = new HashMap<String,String>();
|
|
||||||
infoFields.put("DP","50");
|
|
||||||
|
|
||||||
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
|
GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1",1);
|
||||||
|
List<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
Set<String> filters = null;
|
||||||
|
Map<String, String> attributes = new HashMap<String,String>();
|
||||||
|
Map<String, Genotype> genotypes = new HashMap<String,Genotype>();
|
||||||
|
|
||||||
|
alleles.add(Allele.create("A",true));
|
||||||
|
alleles.add(Allele.create("CC",false));
|
||||||
|
alleles.add(Allele.create("-",false));
|
||||||
|
|
||||||
|
attributes.put("DP","50");
|
||||||
for (String name : header.getGenotypeSamples()) {
|
for (String name : header.getGenotypeSamples()) {
|
||||||
List<VCFGenotypeEncoding> myAlleles = new ArrayList<VCFGenotypeEncoding>();
|
Map<String, String> gtattributes = new HashMap<String,String>();
|
||||||
myAlleles.add(new VCFGenotypeEncoding("C"));
|
gtattributes.put("BB","1");
|
||||||
myAlleles.add(new VCFGenotypeEncoding("D1"));
|
Genotype gt = new Genotype(name,alleles.subList(1,2),0,null,gtattributes,true);
|
||||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED);
|
|
||||||
rec.setField("bb", "0");
|
genotypes.put(name,gt);
|
||||||
gt.add(rec);
|
|
||||||
}
|
}
|
||||||
return new VCFRecord("A","chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt);
|
return new VariantContext("RANDOM",loc, alleles, genotypes, 0, filters, attributes);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue