From 55b756f1cc892808d140440b2c4f40922ac04f24 Mon Sep 17 00:00:00 2001 From: delangel Date: Fri, 16 Jul 2010 22:49:16 +0000 Subject: [PATCH] First step in major cleanup/redo of VCF functionality. Specifically, now: a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version. b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted. c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format. d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved. e) Several VCF 4.0 writer bugs are now solved. f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0. g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension. Pending issues: a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes. b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/org/broad/tribble/vcf/VCF3Codec.java | 124 ++++++++++++++++++ java/src/org/broad/tribble/vcf/VCFCodec.java | 109 +-------------- .../tribble/vcf/VCFCompoundHeaderLine.java | 4 +- .../tribble/vcf/VCFFilterHeaderLine.java | 15 +-- java/src/org/broad/tribble/vcf/VCFHeader.java | 22 +--- .../org/broad/tribble/vcf/VCFHeaderLine.java | 26 ---- .../org/broad/tribble/vcf/VCFReaderUtils.java | 4 +- .../gatk/refdata/features/vcf4/VCF4Codec.java | 108 +++++++++++---- .../walkers/VCF4WriterTestWalker.java | 33 ++--- .../walkers/ProduceBeagleInputWalker.java | 4 +- .../sting/utils/genotype/vcf/VCFUtils.java | 6 +- .../sting/utils/genotype/vcf/VCFWriter.java | 63 ++------- .../refdata/features/vcf4/VCF4UnitTest.java | 6 +- .../walkers/VariantsToVCFIntegrationTest.java | 4 +- ...astaAlternateReferenceIntegrationTest.java | 4 +- .../VariantFiltrationIntegrationTest.java | 10 +- .../UnifiedGenotyperIntegrationTest.java | 2 +- .../indels/IndelRealignerIntegrationTest.java | 4 +- .../PickSequenomProbesIntegrationTest.java | 4 +- ...nomValidationConverterIntegrationTest.java | 4 +- ...ntRecalibrationWalkersIntegrationTest.java | 2 +- .../RodSystemValidationIntegrationTest.java | 4 +- .../VariantSelectIntegrationTest.java | 8 +- .../utils/genotype/vcf/VCFWriterUnitTest.java | 96 +++++++++----- 24 files changed, 339 insertions(+), 327 deletions(-) create mode 100755 java/src/org/broad/tribble/vcf/VCF3Codec.java diff --git a/java/src/org/broad/tribble/vcf/VCF3Codec.java b/java/src/org/broad/tribble/vcf/VCF3Codec.java new file mode 100755 index 000000000..711b25c8d --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCF3Codec.java @@ -0,0 +1,124 @@ +package org.broad.tribble.vcf; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.LineReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Jul 13, 2010 + * Time: 3:57:01 PM + * To change this template use File | Settings | File Templates. + */ +public class VCF3Codec implements FeatureCodec { + + // we have to store the list of strings that make up the header until they're needed + private List headerStrings = new ArrayList(); + private VCFHeader header = null; + private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3; + + + // some classes need to transform the line before + private LineTransform transformer = null; + + /** + * Fast path to get the location of the Feature for indexing + * @param line the input line to decode + * @return + */ + public Feature decodeLoc(String line) { + return reallyDecode(line, true); + } + + /** + * Decode a line as a Feature. + * + * @param line + * + * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is + * a comment) + */ + public Feature decode(String line) { + return reallyDecode(line, false); + } + + private Feature reallyDecode(String line, boolean justLocationPlease ) { + // transform the line, if we have a transform to do + if (transformer != null) line = transformer.lineTransform(line); + if (line.startsWith("#")) + return null; + + // make a VCFRecord of the line and return it + VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease); + if ( ! justLocationPlease ) rec.setHeader(header); + return rec; + } + + /** + * Return the # of header lines for this file. We use this to parse out the header + * + * @return 0 + */ + public int readHeader(LineReader reader) { + String line = ""; + try { + while ((line = reader.readLine()) != null) { + if (line.startsWith("##")) { + headerStrings.add(line); + } + else if (line.startsWith("#")) { + headerStrings.add(line); + header = VCFReaderUtils.createHeader(headerStrings,version); + return headerStrings.size(); + } + else { + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + } + } catch (IOException e) { + throw new RuntimeException("IO Exception ", e); + } + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + /** + * @return VCFRecord.class + */ + public Class getFeatureType() { + return VCFRecord.class; + } + + public VCFHeader getHeader(Class clazz) throws ClassCastException { + if (!clazz.equals(VCFHeader.class)) + throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class); + return header; + } + + public static interface LineTransform { + public String lineTransform(String line); + } + + public LineTransform getTransformer() { + return transformer; + } + + public void setTransformer(LineTransform transformer) { + this.transformer = transformer; + } + + public VCFHeaderVersion getVersion() { + return version; + } + + public void setVersion(VCFHeaderVersion version) { + this.version = version; + } +} + diff --git a/java/src/org/broad/tribble/vcf/VCFCodec.java b/java/src/org/broad/tribble/vcf/VCFCodec.java index 0f6a378c0..510d09a6c 100644 --- a/java/src/org/broad/tribble/vcf/VCFCodec.java +++ b/java/src/org/broad/tribble/vcf/VCFCodec.java @@ -4,6 +4,7 @@ import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.exception.CodecLineParsingException; import org.broad.tribble.util.LineReader; +import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec; import java.io.IOException; import java.util.ArrayList; @@ -18,110 +19,4 @@ import java.util.List; * * The codec for VCF, which relies on VCFReaderUtils to do most of the processing */ -public class VCFCodec implements FeatureCodec { - - // we have to store the list of strings that make up the header until they're needed - private List headerStrings = new ArrayList(); - private VCFHeader header = null; - private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3; - - - // some classes need to transform the line before - private LineTransform transformer = null; - - /** - * Fast path to get the location of the Feature for indexing - * @param line the input line to decode - * @return - */ - public Feature decodeLoc(String line) { - return reallyDecode(line, true); - } - - /** - * Decode a line as a Feature. - * - * @param line - * - * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decode(String line) { - return reallyDecode(line, false); - } - - private Feature reallyDecode(String line, boolean justLocationPlease ) { - // transform the line, if we have a transform to do - if (transformer != null) line = transformer.lineTransform(line); - if (line.startsWith("#")) - return null; - - // make a VCFRecord of the line and return it - VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease); - if ( ! justLocationPlease ) rec.setHeader(header); - return rec; - } - - /** - * Return the # of header lines for this file. We use this to parse out the header - * - * @return 0 - */ - public int readHeader(LineReader reader) { - String line = ""; - try { - while ((line = reader.readLine()) != null) { - if (line.startsWith("##")) { - if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv3" ) ) - throw new CodecLineParsingException("VCF codec can only parse VCF3 formated files. Your version line is " + line + ". If you want to parse VCF4, use VCF4 use VCF as the rod type"); - headerStrings.add(line); - } - else if (line.startsWith("#")) { - headerStrings.add(line); - header = VCFReaderUtils.createHeader(headerStrings,version); - return headerStrings.size(); - } - else { - throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); - } - - } - } catch (IOException e) { - throw new RuntimeException("IO Exception ", e); - } - throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); - } - - /** - * @return VCFRecord.class - */ - public Class getFeatureType() { - return VCFRecord.class; - } - - public VCFHeader getHeader(Class clazz) throws ClassCastException { - if (!clazz.equals(VCFHeader.class)) - throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class); - return header; - } - - public static interface LineTransform { - public String lineTransform(String line); - } - - public LineTransform getTransformer() { - return transformer; - } - - public void setTransformer(LineTransform transformer) { - this.transformer = transformer; - } - - public VCFHeaderVersion getVersion() { - return version; - } - - public void setVersion(VCFHeaderVersion version) { - this.version = version; - } -} +public class VCFCodec extends VCF4Codec {} diff --git a/java/src/org/broad/tribble/vcf/VCFCompoundHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFCompoundHeaderLine.java index bb8d245ac..c78218b91 100644 --- a/java/src/org/broad/tribble/vcf/VCFCompoundHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFCompoundHeaderLine.java @@ -82,7 +82,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF } protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, VCFHeaderVersion version) { - super(lineType.toString(), "", version); + super(lineType.toString(), ""); this.name = name; this.count = count; this.type = type; @@ -98,7 +98,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * */ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), "", version); + super(lineType.toString(), ""); Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); name = mapping.get("ID"); count = version == VCFHeaderVersion.VCF4_0 ? diff --git a/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java index 9d9fd97eb..c3981d759 100755 --- a/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java @@ -34,22 +34,17 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader * @param version the vcf header version */ protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super("FILTER", "", version); + super("FILTER", ""); Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); name = mapping.get("ID"); description = mapping.get("Description"); } protected String toStringEncoding() { - if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2) - return String.format("FILTER=%s,\"%s\"", name, description); - else if (mVersion == VCFHeaderVersion.VCF4_0) { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.put("Description", description); - return "FILTER=" + VCFHeaderLine.toStringEncoding(map); - } - else throw new RuntimeException("Unsupported VCFVersion " + mVersion); + Map map = new LinkedHashMap(); + map.put("ID", name); + map.put("Description", description); + return "FILTER=" + VCFHeaderLine.toStringEncoding(map); } public boolean equals(Object o) { diff --git a/java/src/org/broad/tribble/vcf/VCFHeader.java b/java/src/org/broad/tribble/vcf/VCFHeader.java index 2da95ea66..fc084062b 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeader.java +++ b/java/src/org/broad/tribble/vcf/VCFHeader.java @@ -30,9 +30,6 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; - // our header version - private VCFHeaderVersion version; - /** do we have genotying data? */ private boolean hasGenotypingData = false; @@ -70,7 +67,6 @@ public class VCFHeader { List toRemove = new ArrayList(); for ( VCFHeaderLine line : mMetaData ) if ( VCFHeaderVersion.isFormatString(line.getKey())) { - version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey()); toRemove.add(line); } // remove old header lines for now, @@ -98,10 +94,7 @@ public class VCFHeader { */ public Set getMetaData() { Set lines = new LinkedHashSet(); - if (version == null) - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString())); - else - lines.add(new VCFHeaderLine(version.getFormatString(), version.getVersionString())); + lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); lines.addAll(mMetaData); return lines; } @@ -129,19 +122,6 @@ public class VCFHeader { return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); } - /** - * convert the header to a new VCF version - * @param version the version to convert to - */ - public void setVersion(VCFHeaderVersion version) { - if (version.equals(this.version)) - return; // we're all set, do nothing - - // store the new version, and update each of the header lines - this.version = version; - for (VCFHeaderLine line : mMetaData) - line.setVersion(version); - } } diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java index f3dadccf5..b9ff7948f 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java @@ -43,19 +43,6 @@ public class VCFHeaderLine implements Comparable { private String mKey = null; private String mValue = null; - protected VCFHeaderVersion mVersion = null; - - /** - * create a VCF header line - * - * @param key the key for this header line - * @param value the value for this header line - */ - public VCFHeaderLine(String key, String value, VCFHeaderVersion version) { - mKey = key; - mValue = value; - mVersion = version; - } /** * create a VCF header line @@ -66,7 +53,6 @@ public class VCFHeaderLine implements Comparable { public VCFHeaderLine(String key, String value) { mKey = key; mValue = value; - mVersion = VCFHeaderVersion.VCF3_3; } /** @@ -127,18 +113,6 @@ public class VCFHeaderLine implements Comparable { return toString().compareTo(other.toString()); } - /** - * set the version string, which resets the current stored string representation if the version changed - * @param version - */ - public void setVersion(VCFHeaderVersion version) { - if (!version.equals(this.mVersion)) this.stringRep = null; - this.mVersion = version; - } - - public VCFHeaderVersion getVersion() { - return mVersion; - } /** * create a string of a mapping pair for the target VCF version diff --git a/java/src/org/broad/tribble/vcf/VCFReaderUtils.java b/java/src/org/broad/tribble/vcf/VCFReaderUtils.java index 9efe455c8..9eec59c56 100644 --- a/java/src/org/broad/tribble/vcf/VCFReaderUtils.java +++ b/java/src/org/broad/tribble/vcf/VCFReaderUtils.java @@ -17,7 +17,7 @@ public class VCFReaderUtils { * package protected so that the VCFReaderUtils can access this function * * @param headerStrings a list of header strings - * + * @param version Header version to parse * @return a VCF Header created from the list of stinrgs */ public static VCFHeader createHeader(List headerStrings, VCFHeaderVersion version) { @@ -53,7 +53,7 @@ public class VCFReaderUtils { else { int equals = str.indexOf("="); if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version)); + metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 014a3b5c5..9b78d3b8b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -25,12 +25,11 @@ import java.util.*; */ public class VCF4Codec implements FeatureCodec, NameAwareCodec { - // a variant context flag for original allele strings - public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST"; // we have to store the list of strings that make up the header until they're needed private VCFHeader header = null; + private VCFHeaderVersion version = VCFHeaderVersion.VCF4_0; // used to convert the index of the alternate allele in genotypes to a integer index private static int ZERO_CHAR = (byte)'0'; @@ -66,6 +65,9 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { private int lineNo = 0; + // some classes need to transform the line before + private LineTransform transformer = null; + /** * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates * @param reader the line reader to take header lines from @@ -77,14 +79,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { String line = ""; try { + boolean foundHeaderVersion = false; while ((line = reader.readLine()) != null) { lineNo++; if (line.startsWith("##")) { - if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv4" ) ) - throw new CodecLineParsingException("VCF4 codec can only parse VCF4 formated files. Your version line is " + line + ". If you want VCF3 parsing, use VCF as the rod type."); + String[] lineFields = line.substring(2).split("="); + if (lineFields.length == 2 && + VCFHeaderVersion.isVersionString(lineFields[1]) && VCFHeaderVersion.isFormatString(lineFields[0])) { + foundHeaderVersion = true; + this.version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); + } headerStrings.add(line); } else if (line.startsWith("#")) { + if (!foundHeaderVersion) { + throw new CodecLineParsingException("We never saw a header line specifying VCF version"); + } return createHeader(headerStrings, line); } else { @@ -107,7 +117,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { */ public int createHeader(List headerStrings, String line) { headerStrings.add(line); - header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0); + header = VCFReaderUtils.createHeader(headerStrings, this.version); // load the parsing fields Set headerLines = header.getMetaData(); @@ -226,12 +236,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { String[] split = str.split(","); for (String substring : split) { VCFHeaderLineType type = infoFields.get(key); - objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring); +// objects.add(type != null ? type.convert(substring,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : substring); + objects.add(substring); } value = objects; } else { VCFHeaderLineType type = infoFields.get(key); - value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str; + //value = type != null ? type.convert(str,VCFCompoundHeaderLine.SupportedHeaderLineType.INFO) : str; + value = str; } //System.out.printf("%s %s%n", key, value); } else { @@ -269,17 +281,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { * @param qualString the quality string * @return return a double */ - private static Double parseQual(String qualString) { - // todo -- remove double once we deal with annoying VCFs from 1KG - if ( qualString.equals(".") ) - return VariantContext.NO_NEG_LOG_10PERROR; - else { - double q = Double.valueOf(qualString); - if ( q == -1 ) - return VariantContext.NO_NEG_LOG_10PERROR; - else - return Double.valueOf(qualString) / 10; - } + private Double parseQual(String qualString) { + if (qualString.equals(VCFConstants.MISSING_VALUE_v4)) + return VariantContext.NO_NEG_LOG_10PERROR; + else { + double q = Double.valueOf(qualString); + if ( q == -1 ) + return VariantContext.NO_NEG_LOG_10PERROR; + else + return Double.valueOf(qualString) / 10; + } } /** @@ -346,7 +357,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { Set fFields; // a PASS is simple (no filters) - if ( filterString.equals("PASS") ) { + String passString = VCFConstants.PASSES_FILTERS_v3; + if (this.version == VCFHeaderVersion.VCF4_0) + passString = VCFConstants.PASSES_FILTERS_v4; + + if ( filterString.equals(passString) ) { + return null; + } + if ( filterString.equals(VCFConstants.UNFILTERED)) { return null; } // else do we have the filter string cached? @@ -373,6 +391,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { * parse out the VCF line * * @param parts the parts split up + * @param parseGenotypes whether to parse genotypes or not * @return a variant context object */ private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) { @@ -398,7 +417,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { // find out our current location, and clip the alleles down to their minimum length Pair> locAndAlleles; if ( !isSingleNucleotideEvent(alleles) ) { - attributes.put(ORIGINAL_ALLELE_LIST,alleles); + if (this.version != VCFHeaderVersion.VCF4_0) + throw new VCFParserException("Saw Indel/non SNP event on a VCF 3.3 or earlier file. Please convert file to VCF4.0 with VCFTools before using the GATK on it"); locAndAlleles = clipAlleles(contig, pos, ref, alleles); } else { locAndAlleles = new Pair>(GenomeLocParser.createGenomeLoc(contig, pos), alleles); @@ -469,8 +489,14 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { if (nGTKeys >= 1) { gtAttributes = new HashMap(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { - if (i >= GTValueSplitSize) - gtAttributes.put(genotypeKeyArray[i],"."); + if (i >= GTValueSplitSize) { + if (genotypeKeyArray[i].equals("GQ")) + GTQual = parseQual(VCFConstants.MISSING_VALUE_v4); + else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here + genotypeFilters = parseFilters(VCFConstants.MISSING_VALUE_v4); + else + gtAttributes.put(genotypeKeyArray[i],VCFConstants.MISSING_VALUE_v4); + } else if (genotypeKeyArray[i].equals("GT")) if (i != 0) throw new VCFParserException("Saw GT at position " + i + ", it must be at the first position for genotypes. At location = " + locAndAlleles.first); @@ -480,9 +506,11 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { GTQual = parseQual(GTValueArray[i]); else if (genotypeKeyArray[i].equals("FT")) // deal with genotype filters here genotypeFilters = parseFilters(GTValueArray[i]); - else + else { + if (this.version != VCFHeaderVersion.VCF4_0 && GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3)) + GTValueArray[i] = VCFConstants.MISSING_VALUE_v4; gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]); - + } } // validate the format fields validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); @@ -517,7 +545,22 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { static Pair> clipAlleles(String contig, long position, String ref, List unclippedAlleles) { List newAlleleList = new ArrayList(); - // find the preceeding string common to all alleles and the reference +/* //+ + boolean clipping = true; + int forwardClipping = 0; + while(clipping) { + for (Allele a : unclippedAlleles) { + if (a.length()-forwardClipping == 0) + clipping = false; + else if (a.getBases()[forwardClipping] != ref.getBytes()[forwardClipping]) + clipping = false; + if (clipping) forwardClipping++; + } + + + } + //- + */ // find the preceeding string common to all alleles and the reference boolean clipping = true; for (Allele a : unclippedAlleles) if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) { @@ -576,6 +619,19 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { return name; } + public static interface LineTransform { + public String lineTransform(String line); + } + + public LineTransform getTransformer() { + return transformer; + } + + public void setTransformer(LineTransform transformer) { + this.transformer = transformer; + } + + /** * set the name of this codec * @param name @@ -583,4 +639,6 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { public void setName(String name) { this.name = name; } + + } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java index f6945c7d2..573668020 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java @@ -62,7 +62,7 @@ public class VCF4WriterTestWalker extends RodWalker { protected static String line = null; final TreeSet samples = new TreeSet(); - VCF4Codec vcf4codec = new VCF4Codec(); + VCFCodec vcf4codec = new VCFCodec(); /** @@ -86,32 +86,25 @@ public class VCF4WriterTestWalker extends RodWalker { for( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { - if (rod.getType().equals(vcf4codec.getClass())) { - try { - AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath())); - int lineNumber = vcf4codec.readHeader(lineReader); - out.printf("Read %d header lines%n", lineNumber); + try { + AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath())); + int lineNumber = vcf4codec.readHeader(lineReader); + out.printf("Read %d header lines%n", lineNumber); - header = vcf4codec.getHeader(VCFHeader.class); - } - catch (FileNotFoundException e ) { - throw new StingException(e.getMessage()); - } - } else { - final VCFReader reader = new VCFReader(rod.getFile()); - final Set vcfSamples = reader.getHeader().getGenotypeSamples(); - samples.addAll(vcfSamples); - reader.close(); - header = new VCFHeader(hInfo, samples); + header = vcf4codec.getHeader(VCFHeader.class); } + catch (FileNotFoundException e ) { + throw new StingException(e.getMessage()); + } + + final Set vcfSamples = header.getGenotypeSamples(); + samples.addAll(vcfSamples); + } } - if ( header != null ) - header.setVersion(VCFHeaderVersion.VCF4_0); - vcfWriter.writeHeader(header); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java index 6f8d0384c..1c7ba925f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java @@ -86,10 +86,10 @@ public class ProduceBeagleInputWalker extends RodWalker { if ( vc_eval == null || vc_eval.isFiltered() ) return 0; - if (vc_eval.getType() != VariantContext.Type.SNP) + if (!vc_eval.isSNP()) return 0; - if (vc_eval.getAlleles().size()!= 2) + if (!vc_eval.isBiallelic()) return 0; // output marker ID to Beagle input file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index bb5516e63..98c74c8d8 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -61,7 +62,7 @@ public class VCFUtils { List dataSources = toolkit.getRodDataSources(); for ( ReferenceOrderedDataSource source : dataSources ) { RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getRecordType().equals(VCFRecord.class) ) { + if ( rod.getRecordType().equals(VariantContext.class) ) { fields.addAll(rod.getHeader(VCFHeader.class).getMetaData()); } } @@ -139,7 +140,7 @@ public class VCFUtils { if ( map.containsKey(key) ) { VCFHeaderLine other = map.get(key); - if ( line.equals(other) || line.getVersion() != VCFHeaderVersion.VCF4_0 ) // todo -- remove me when everything is 4 + if ( line.equals(other) ) continue; else if ( ! line.getClass().equals(other.getClass()) ) throw new IllegalStateException("Incompatible header types: " + line + " " + other ); @@ -173,7 +174,6 @@ public class VCFUtils { if ( logger != null ) logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other)); } } else { - line.setVersion(VCFHeaderVersion.VCF4_0); // todo -- remove this when we finally have vcf3/4 unified headers map.put(key, line); //System.out.printf("Adding header line %s%n", line); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index ee49c95e9..51cfb044f 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -201,9 +201,6 @@ public class VCFWriter { String paddingBases = ""; String trailingBases = ""; - ArrayList originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST"); - - // search for reference allele and find trailing and padding at the end. // See first if all alleles have a base encoding (ie no deletions) // if so, add one common base to all alleles (reference at this location) @@ -217,50 +214,12 @@ public class VCFWriter { refString = new String(a.getBases()); } - if (originalAlleles != null) { - // if original allele info was filled when reading from a VCF4 file, - // determine whether there was a padding base(s) at the beginning and end. - byte previousBase = 0; - int cnt=0; - boolean firstBaseCommonInAllAlleles = true; - Allele originalReferenceAllele = null; - for (Allele originalAllele : originalAlleles){ - // if first base of allele is common to all of them, there may have been a common base deleted from all - byte firstBase = originalAllele.getBases()[0]; - if (cnt > 0) { - if (firstBase != previousBase) - firstBaseCommonInAllAlleles = false; - } - previousBase = firstBase; - cnt++; - - if (originalAllele.isReference()) - originalReferenceAllele = originalAllele; - } - - numTrailingBases = (firstBaseCommonInAllAlleles)? 1:0; - position -= numTrailingBases; - - if (originalReferenceAllele == null) - throw new IllegalStateException("At least one Allele must be reference"); - - String originalRef = new String(originalReferenceAllele.getBases()); - numPaddingBases = originalRef.length()-refString.length()-numTrailingBases; - - if (numTrailingBases > 0) { - trailingBases = originalRef.substring(0,numTrailingBases); - } - if (numPaddingBases > 0) - paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length()); - } - else { - // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK. - if (!hasBasesInAllAlleles) { - trailingBases = new String(refBases); - numTrailingBases = 1; - position--; - } + // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK. + if (!hasBasesInAllAlleles) { + trailingBases = new String(refBases); + numTrailingBases = 1; + position--; } @@ -295,6 +254,7 @@ public class VCFWriter { // this needs to be done in case all samples are no-calls vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); } + String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); List genotypeObjects = new ArrayList(vc.getGenotypes().size()); @@ -378,10 +338,6 @@ public class VCFWriter { if ( key.equals("ID") ) continue; - // Original alleles are not for reporting but only for internal bookkeeping - if (key.equals("ORIGINAL_ALLELE_LIST")) - continue; - String outputValue = formatVCFField(key, elt.getValue()); if ( outputValue != null ) infoFields.put(key, outputValue); @@ -587,14 +543,21 @@ public class VCFWriter { Set keys = new HashSet(); boolean sawGoodQual = false; + boolean sawGenotypeFilter = false; for ( Genotype g : vc.getGenotypes().values() ) { keys.addAll(g.getAttributes().keySet()); if ( g.hasNegLog10PError() ) sawGoodQual = true; + if (g.isFiltered() && g.isCalled()) + sawGenotypeFilter = true; } if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY); + + if (sawGenotypeFilter) + keys.add(VCFConstants.GENOTYPE_FILTER_KEY); + return Utils.sorted(new ArrayList(keys)); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 81aa76909..0a222e27f 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -205,20 +205,20 @@ public class VCF4UnitTest extends BaseTest { String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51"; @Test public void testClippedAllelesAddedAsAnnotation() { - TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + // TODO - either modify or kill this test +/* TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine); Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST)); List alleles = (List)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST); Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size()); Assert.assertTrue(alleles.get(0).basesMatch("GGG")); Assert.assertTrue(alleles.get(1).basesMatch("G")); - } + */ } // test that when we don't clip the alleles, we don't see the annotation @Test public void testNoClippedAllelesNoAddedAsAnnotation() { TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine); - Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST)); } // test that we're getting the right genotype for a multi-base polymorphism diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java index 696ee9526..8eabe0d6d 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java @@ -70,11 +70,11 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("b423141ca600d581dc73e9b3dff4f782"); + md5.add("919eb499bfcc980a14825a0265e575e3"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + oneKGLocation + "reference/human_b36_both.fasta" + - " -B variant,VCF," + validationDataLocation + "complexExample.vcf" + + " -B variant,VCF," + validationDataLocation + "complexExample.vcf4" + " -T VariantsToVCF" + " -o %s", 1, // just one output file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index 093a1fdaa..3124e805b 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -16,9 +16,9 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest { executeTest("testFastaReference", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", + "-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", 1, - Arrays.asList("3a48986c3832a768b478c3e95f994b0f")); + Arrays.asList("a7bfc673eaa202a668c1424a933671ad")); executeTest("testFastaAlternateReferenceIndels", spec2); WalkerTestSpec spec4 = new WalkerTestSpec( diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 1d48e0b7e..ac1643ced 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testClusteredSnps() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -window 10 -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fa522e5b8c89c5a465ca03dd88bc2b8f")); + Arrays.asList("2385975931cd06fca452655bebf5c379")); executeTest("test clustered SNPs", spec); } @@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -mask foo -B mask,VCF," + validationDataLocation + "vcfexample2.vcf -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4d8a184417289c325410a5e545d9fc80")); + Arrays.asList("6743efa09985206819adcf8eaf5ff936")); executeTest("test mask", spec); } @@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("d48062f139d853a61e4f0bfb76bd695d")); + Arrays.asList("e054f57d3794ce8a57cae92f16886cf0")); executeTest("test filter #1", spec); } @@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("791f89c6da7bdde79913754d9ddb72eb")); + Arrays.asList("db09a1f7ff523087ea0f6a56f3febfe7")); executeTest("test filter #2", spec); } @@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilterWithSeparateNames() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterName ABF -filter 'AlleleBalance < 70.0' --filterName FSF -filter 'FisherStrand == 1.4' -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("174caf880477f3edf68fe6722f227017")); + Arrays.asList("edbd505d8d55b4ba71f99d7006871db0")); executeTest("test filter with separate names #2", spec); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 31839e320..0a6f2e660 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("9ebe61dcb5112e7e745412d7767d101a")); + Arrays.asList("e3f402fbbb6bbb4f60b1aa0549989d85")); executeTest("testConfidence2", spec2); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index b441ff453..d9eb04bc1 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -29,9 +29,9 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealignerKnownsOnly() { - String[] md5s = {"7084d4e543bc756730ab306768028530", "1091436c40d5ba557d85662999cc0c1d"}; + String[] md5s = {"83bc0c9a7d8872b552c6cbd994672c3b", "92bf331b672a03d63c26e4d9d3615f5b"}; WalkerTestSpec spec = new WalkerTestSpec( - "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly", + "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly", 2, Arrays.asList(md5s)); executeTest("test realigner known indels only", spec); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java index 9a3ff8d3d..9722071c2 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java @@ -8,10 +8,10 @@ import java.util.Arrays; public class PickSequenomProbesIntegrationTest extends WalkerTest { @Test public void testProbes() { - String testVCF = validationDataLocation + "complexExample.vcf"; + String testVCF = validationDataLocation + "complexExample.vcf4"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("0f356354a4a78ff62b2848431ec11262")); + Arrays.asList("46d2acea95d36725db63af61ee963ce6")); executeTest("Test probes", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java index 22e084659..675aa0fa2 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java @@ -11,7 +11,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest { String testPedFile = validationDataLocation + "Sequenom_Test_File.txt"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s"; WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("2e273d400b4b69e39c34e465b200b192")); + Arrays.asList("2dab4630f40b76c0762de83fcbb60d09")); executeTest("Test SNPs", spec); } @@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest { String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s"; WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("e15a63fc49ec25ebcae60a28a5f3f830")); + Arrays.asList("845b9a15ac947052ddded5b79228e5ec")); executeTest("Test Indels", spec); } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index cdcd18fe0..65f70f7b8 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testVariantRecalibrator() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "1f7adb28007d77e65c02112480f56663" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "a1acb90f0695cbe33c290403113ac3e1" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java index 0f920c27f..7bd2dabeb 100644 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/validation/RodSystemValidationIntegrationTest.java @@ -35,9 +35,9 @@ public class RodSystemValidationIntegrationTest extends WalkerTest { public void testComplexVCFPileup() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" + - " -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf" + " -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4" , 1, - Arrays.asList("c775c995c9fc09c66db51a694511d07b")); + Arrays.asList("216b5de6f58be6cf3286ed5261772904")); executeTest("testComplexVCFPileup", spec); } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java index b6757da0b..e4f6ee26e 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java @@ -41,7 +41,7 @@ public class VariantSelectIntegrationTest extends WalkerTest { public void testVCFSelect1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("8b358e0cfa35de022a37360a6f28a839")); + Arrays.asList("8fd97a99174483920f84aaf54eeb9dd9")); executeTest("testVCFSelect1", spec); } @@ -49,7 +49,7 @@ public class VariantSelectIntegrationTest extends WalkerTest { public void testVCFSelect2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1, - Arrays.asList("8e991b9d6d610c8f89c8557756fc8e34")); + Arrays.asList("ecb61d798ff5e0b003d2986500bf4462")); executeTest("testVCFSelect2", spec); } @@ -57,7 +57,7 @@ public class VariantSelectIntegrationTest extends WalkerTest { public void testVCFSelectOr() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("7bd064c8d8bf5389fcd0b78a7c32b599")); + Arrays.asList("7c23412a0abe28057a78cd532b752019")); executeTest("testVCFSelectOr", spec); } @@ -65,7 +65,7 @@ public class VariantSelectIntegrationTest extends WalkerTest { public void testVCFSelectAnd() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("5af565836fa926feaa130715b93188bc")); + Arrays.asList("71cc18348a9d5de7270613bedb79880a")); executeTest("testVCFSelectAnd", spec); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index a2c2b4027..a6224974c 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -1,8 +1,13 @@ package org.broadinstitute.sting.utils.genotype.vcf; +import org.broad.tribble.util.AsciiLineReader; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; import org.junit.Assert; @@ -10,7 +15,9 @@ import org.junit.Test; import org.junit.BeforeClass; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.*; import net.sf.picard.reference.IndexedFastaSequenceFile; @@ -34,30 +41,47 @@ public class VCFWriterUnitTest extends BaseTest { GenomeLocParser.setupRefContigOrdering(seq); } - @Test - public void emptyTest() { - } - /** test, using the writer and reader, that we can output and input a VCF file without problems */ - //@Test + @Test public void testBasicWriteAndRead() { VCFHeader header = createFakeHeader(metaData,additionalColumns); VCFWriter writer = new VCFWriter(fakeVCFFile); writer.writeHeader(header); - writer.addRecord(createVCFRecord(header)); - writer.addRecord(createVCFRecord(header)); + writer.add(createVC(header),"A".getBytes()); + writer.add(createVC(header),"A".getBytes()); writer.close(); - VCFReader reader = new VCFReader(fakeVCFFile); - int counter = 0; - // validate what we're reading in - validateHeader(reader.getHeader()); - for (VCFRecord rec : reader) { - counter++; + VCFCodec reader = new VCFCodec(); + AsciiLineReader lineReader; + + try { + lineReader = new AsciiLineReader(new FileInputStream(fakeVCFFile)); + int lineNumber = reader.readHeader(lineReader); } - Assert.assertEquals(2,counter); - reader.close(); - new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete(); - //fakeVCFFile.delete(); + catch (FileNotFoundException e ) { + throw new StingException(e.getMessage()); + } + + int counter = 0; + + // validate what we're reading in + validateHeader(reader.getHeader(VCFHeader.class)); + try { + while(true) { + String line = lineReader.readLine(); + if (line == null) + break; + + VariantContext vc = (VariantContext)reader.decode(line); + counter++; + } + Assert.assertEquals(2,counter); + new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete(); + fakeVCFFile.delete(); + } + catch (IOException e ) { + throw new StingException(e.getMessage()); + } + } /** @@ -67,8 +91,8 @@ public class VCFWriterUnitTest extends BaseTest { * @return a fake VCF header */ public static VCFHeader createFakeHeader(Set metaData, Set additionalColumns) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString(),VCFHeaderVersion.VCF3_3)); - metaData.add(new VCFHeaderLine("two", "2",VCFHeaderVersion.VCF3_3)); + metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); + metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("FORMAT"); additionalColumns.add("extra1"); additionalColumns.add("extra2"); @@ -80,23 +104,29 @@ public class VCFWriterUnitTest extends BaseTest { * @param header the VCF header * @return a VCFRecord */ - private VCFRecord createVCFRecord(VCFHeader header) { - List altBases = new ArrayList(); - altBases.add(new VCFGenotypeEncoding("C")); - altBases.add(new VCFGenotypeEncoding("D1")); - Map infoFields = new HashMap(); - infoFields.put("DP","50"); + private VariantContext createVC(VCFHeader header) { - List gt = new ArrayList(); + GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1",1); + List alleles = new ArrayList(); + Set filters = null; + Map attributes = new HashMap(); + Map genotypes = new HashMap(); + + alleles.add(Allele.create("A",true)); + alleles.add(Allele.create("CC",false)); + alleles.add(Allele.create("-",false)); + + attributes.put("DP","50"); for (String name : header.getGenotypeSamples()) { - List myAlleles = new ArrayList(); - myAlleles.add(new VCFGenotypeEncoding("C")); - myAlleles.add(new VCFGenotypeEncoding("D1")); - VCFGenotypeRecord rec = new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED); - rec.setField("bb", "0"); - gt.add(rec); + Map gtattributes = new HashMap(); + gtattributes.put("BB","1"); + Genotype gt = new Genotype(name,alleles.subList(1,2),0,null,gtattributes,true); + + genotypes.put(name,gt); + } - return new VCFRecord("A","chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt); + return new VariantContext("RANDOM",loc, alleles, genotypes, 0, filters, attributes); + }