Updates to VCF processing for better BCF processing

-- getMetaData now split into getMetaDataInSortedOrder() [old functionality] and getMetaDataInOriginalOrder() [according to the header order].  Important as BCF uses the order of elements in the header in the offsets to keys, and we were automatically sorting the BCF2 header which is out of order in samtools and the whole system was going crazy
-- Updating GATK code to use the appropriate header function (this is why so many files have changed)
-- BCF2 code was busted in not differentiating PASS from . from FILTER in VC (tests coming that will actually stress this)
-- Bugfix for adding contig lines to BCF2 header dictionary
-- VCFHeader metaData no longer sorted internally.  The system now maintains the data in header order, and only sorts output as requested in API
-- VCFWriter and BCF2Writer now explictly sort their header lines
-- Don't allow filters to be added that are PASS in the contract
This commit is contained in:
Mark DePristo 2012-07-08 15:43:03 -07:00
parent 63f5262e45
commit 5b0ade67c8
18 changed files with 51 additions and 49 deletions

View File

@ -27,15 +27,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine;
import org.apache.log4j.Logger;
import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
@ -69,7 +66,7 @@ public class VCFDiffableReader implements DiffableReader {
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false);
VCFHeader header = (VCFHeader)reader.getHeader();
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
for ( VCFHeaderLine headerLine : header.getMetaDataInInputOrder() ) {
String key = headerLine.getKey();
if ( headerLine instanceof VCFIDHeaderLine)
key += "_" + ((VCFIDHeaderLine) headerLine).getID();

View File

@ -102,7 +102,7 @@ class MergeSegregatingAlternateAllelesVCFWriter implements VariantContextWriter
if (useSingleSample != null) { // only want to output context for one sample
Set<String> singSampSet = new TreeSet<String>();
singSampSet.add(useSingleSample);
header = new VCFHeader(header.getMetaData(), singSampSet);
header = new VCFHeader(header.getMetaDataInSortedOrder(), singSampSet);
}
innerWriter.writeHeader(header);

View File

@ -63,7 +63,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaData() : null, samples);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaDataInSortedOrder() : null, samples);
writer.writeHeader(vcfHeader);
}

View File

@ -92,7 +92,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaData();
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder();
baseWriter.writeHeader(new VCFHeader(headerLines, samples));
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200);

View File

@ -95,7 +95,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
if ( vcfHeaders.containsKey(trackName) )
metaData.addAll(vcfHeaders.get(trackName).getMetaData());
metaData.addAll(vcfHeaders.get(trackName).getMetaDataInSortedOrder());
if ( RECORD_ORIGINAL_LOCATION ) {
metaData.add(new VCFInfoHeaderLine("OriginalChr", 1, VCFHeaderLineType.String, "Original contig name for the record"));
metaData.add(new VCFInfoHeaderLine("OriginalStart", 1, VCFHeaderLineType.Integer, "Original start position for the record"));

View File

@ -385,10 +385,14 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
if ( value == null )
builder.unfiltered();
else {
if ( value instanceof Integer )
if ( value instanceof Integer ) {
// fast path for single integer result
builder.filter(getDictionaryString((Integer)value));
else {
final String filterString = getDictionaryString((Integer)value);
if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString))
builder.passFilters();
else
builder.filter(filterString);
} else {
for ( final int offset : (List<Integer>)value )
builder.filter(getDictionaryString(offset));
}

View File

@ -26,10 +26,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
@ -84,8 +81,8 @@ public final class BCF2Utils {
boolean sawPASS = false;
// set up the strings dictionary
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line instanceof VCFIDHeaderLine) {
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) {
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
if ( ! seen.contains(idLine.getID())) {
sawPASS = sawPASS || idLine.getID().equals(VCFConstants.PASSES_FILTERS_v4);

View File

@ -115,7 +115,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
this.version = version;
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<VCFHeaderLine> metaData = new LinkedHashSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>();
int contigCounter = 0;
// iterate over all the passed in strings

View File

@ -52,7 +52,7 @@ public class VCFHeader {
}
// the associated meta data
private final Set<VCFHeaderLine> mMetaData = new TreeSet<VCFHeaderLine>();
private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
@ -230,14 +230,22 @@ public class VCFHeader {
}
/**
* get the meta data, associated with this header
* get the meta data, associated with this header, in sorted order
*
* @return a set of the meta data
*/
public Set<VCFHeaderLine> getMetaData() {
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
public Set<VCFHeaderLine> getMetaDataInInputOrder() {
return makeGetMetaDataSet(mMetaData);
}
public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
}
private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
lines.addAll(mMetaData);
lines.addAll(headerLinesInSomeOrder);
return Collections.unmodifiableSet(lines);
}
@ -247,7 +255,7 @@ public class VCFHeader {
* @return
*/
public VCFHeaderLine getMetaDataLine(final String key) {
for (final VCFHeaderLine line: getMetaData()) {
for (final VCFHeaderLine line: mMetaData) {
if ( line.getKey().equals(key) )
return line;
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -60,8 +59,8 @@ public class VCFStandardHeaderLines {
@Requires("header != null")
@Ensures("result != null")
public static VCFHeader repairStandardHeaderLines(final VCFHeader header) {
final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaData().size());
for ( VCFHeaderLine line : header.getMetaData() ) {
final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder().size());
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
if ( line instanceof VCFFormatHeaderLine ) {
line = formatStandards.repair((VCFFormatHeaderLine) line);
} else if ( line instanceof VCFInfoHeaderLine) {

View File

@ -25,8 +25,6 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
@ -34,7 +32,6 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
@ -129,7 +126,7 @@ public class VCFUtils {
if ( source.getRecordType().equals(VariantContext.class)) {
VCFHeader header = (VCFHeader)source.getHeader();
if ( header != null )
fields.addAll(header.getMetaData());
fields.addAll(header.getMetaDataInSortedOrder());
}
}
@ -160,7 +157,7 @@ public class VCFUtils {
// todo -- needs to remove all version headers from sources and add its own VCF version line
for ( VCFHeader source : headers ) {
//System.out.printf("Merging in header %s%n", source);
for ( VCFHeaderLine line : source.getMetaData()) {
for ( VCFHeaderLine line : source.getMetaDataInSortedOrder()) {
String key = line.getKey();
if ( line instanceof VCFIDHeaderLine )
@ -250,7 +247,7 @@ public class VCFUtils {
* @param refDict the SAM formatted reference sequence dictionary
*/
public final static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) {
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaData(), referenceFile, refDict), oldHeader.getGenotypeSamples());
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaDataInInputOrder(), referenceFile, refDict), oldHeader.getGenotypeSamples());
}
public final static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict) {

View File

@ -260,6 +260,7 @@ public class VariantContextBuilder {
return this;
}
@Requires({"filter != null", "!filter.equals(\"PASS\")"})
public VariantContextBuilder filter(final String filter) {
if ( this.filters == null ) this.filters = new LinkedHashSet<String>(1);
this.filters.add(filter);

View File

@ -109,7 +109,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
// --------------------------------------------------------------------------------
@Override
public void writeHeader(final VCFHeader header) {
public void writeHeader(VCFHeader header) {
// make sure the header is sorted correctly
header = new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples());
// create the config offsets map
if ( header.getContigLines().isEmpty() ) {
if ( ALLOW_MISSING_CONTIG_LINES ) {

View File

@ -87,13 +87,13 @@ class VCFWriter extends IndexingVariantContextWriter {
final boolean doNotWriteGenotypes,
final String versionLine,
final String streamNameForError) {
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
try {
// the file format field needs to be written first
writer.write(versionLine + "\n");
for ( VCFHeaderLine line : header.getMetaData() ) {
for ( VCFHeaderLine line : header.getMetaDataInSortedOrder() ) {
if ( VCFHeaderVersion.isFormatString(line.getKey()) )
continue;

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
@ -26,7 +25,7 @@ public class VCFHeaderUnitTest extends BaseTest {
private VCFHeader createHeader(String headerStr) {
VCFCodec codec = new VCFCodec();
VCFHeader header = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(headerStr))));
Assert.assertEquals(header.getMetaData().size(), VCF4headerStringCount);
Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount);
return header;
}
@ -98,7 +97,7 @@ public class VCFHeaderUnitTest extends BaseTest {
} catch (IOException e) {
Assert.fail("Unable to make a temp file!");
}
for (VCFHeaderLine line : header.getMetaData())
for (VCFHeaderLine line : header.getMetaDataInSortedOrder())
pw.println(line);
pw.close();
Assert.assertEquals(md5SumFile(myTempFile), md5sum);

View File

@ -59,7 +59,7 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest("Test writing samtools WEx BCF example", spec1);
}
@Test(enabled = false) // TODO disabled because current BCF2 is 1 based
@Test(enabled = false)
public void testReadingSamtoolsWExBCFExample() {
String testVCF = privateTestDir + "ex2.bcf";
String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s ";

View File

@ -106,7 +106,7 @@ public class VariantContextTestProvider {
for ( final VariantContext vc : vcs )
if ( vc.hasGenotypes() )
samples.addAll(vc.getSampleNames());
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaData(), samples);
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples);
this.vcs = vcs;
}
@ -885,12 +885,12 @@ public class VariantContextTestProvider {
}
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size(), "No VCF header lines");
Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");
// for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
//Assert.assertEquals(actual.getMetaData(), expected.getMetaData());
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaData());
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaData());
//Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
for ( int i = 0; i < actualLines.size(); i++ ) {
Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
}

View File

@ -39,9 +39,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@ -166,7 +163,7 @@ public class VCFWriterUnitTest extends BaseTest {
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
index++;
}
Assert.assertEquals(header.getMetaData().size(), metaData.size());
Assert.assertEquals(header.getMetaDataInSortedOrder().size(), metaData.size());
index = 0;
for (String key : header.getGenotypeSamples()) {
Assert.assertTrue(additionalColumns.contains(key));