Updates to VCF processing for better BCF processing
-- getMetaData now split into getMetaDataInSortedOrder() [old functionality] and getMetaDataInOriginalOrder() [according to the header order]. Important as BCF uses the order of elements in the header in the offsets to keys, and we were automatically sorting the BCF2 header which is out of order in samtools and the whole system was going crazy -- Updating GATK code to use the appropriate header function (this is why so many files have changed) -- BCF2 code was busted in not differentiating PASS from . from FILTER in VC (tests coming that will actually stress this) -- Bugfix for adding contig lines to BCF2 header dictionary -- VCFHeader metaData no longer sorted internally. The system now maintains the data in header order, and only sorts output as requested in API -- VCFWriter and BCF2Writer now explictly sort their header lines -- Don't allow filters to be added that are PASS in the contract
This commit is contained in:
parent
63f5262e45
commit
5b0ade67c8
|
|
@ -27,15 +27,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -69,7 +66,7 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
|
||||
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false);
|
||||
VCFHeader header = (VCFHeader)reader.getHeader();
|
||||
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
|
||||
for ( VCFHeaderLine headerLine : header.getMetaDataInInputOrder() ) {
|
||||
String key = headerLine.getKey();
|
||||
if ( headerLine instanceof VCFIDHeaderLine)
|
||||
key += "_" + ((VCFIDHeaderLine) headerLine).getID();
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ class MergeSegregatingAlternateAllelesVCFWriter implements VariantContextWriter
|
|||
if (useSingleSample != null) { // only want to output context for one sample
|
||||
Set<String> singSampSet = new TreeSet<String>();
|
||||
singSampSet.add(useSingleSample);
|
||||
header = new VCFHeader(header.getMetaData(), singSampSet);
|
||||
header = new VCFHeader(header.getMetaDataInSortedOrder(), singSampSet);
|
||||
}
|
||||
|
||||
innerWriter.writeHeader(header);
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
|
|||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
|
||||
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaData() : null, samples);
|
||||
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? vcfHeaders.get(trackName).getMetaDataInSortedOrder() : null, samples);
|
||||
writer.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
|
|||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
|
||||
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
|
||||
|
||||
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaData();
|
||||
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder();
|
||||
baseWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
|
||||
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200);
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
if ( vcfHeaders.containsKey(trackName) )
|
||||
metaData.addAll(vcfHeaders.get(trackName).getMetaData());
|
||||
metaData.addAll(vcfHeaders.get(trackName).getMetaDataInSortedOrder());
|
||||
if ( RECORD_ORIGINAL_LOCATION ) {
|
||||
metaData.add(new VCFInfoHeaderLine("OriginalChr", 1, VCFHeaderLineType.String, "Original contig name for the record"));
|
||||
metaData.add(new VCFInfoHeaderLine("OriginalStart", 1, VCFHeaderLineType.Integer, "Original start position for the record"));
|
||||
|
|
|
|||
|
|
@ -385,10 +385,14 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
if ( value == null )
|
||||
builder.unfiltered();
|
||||
else {
|
||||
if ( value instanceof Integer )
|
||||
if ( value instanceof Integer ) {
|
||||
// fast path for single integer result
|
||||
builder.filter(getDictionaryString((Integer)value));
|
||||
else {
|
||||
final String filterString = getDictionaryString((Integer)value);
|
||||
if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString))
|
||||
builder.passFilters();
|
||||
else
|
||||
builder.filter(filterString);
|
||||
} else {
|
||||
for ( final int offset : (List<Integer>)value )
|
||||
builder.filter(getDictionaryString(offset));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,10 +26,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
|
|
@ -84,8 +81,8 @@ public final class BCF2Utils {
|
|||
|
||||
boolean sawPASS = false;
|
||||
// set up the strings dictionary
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line instanceof VCFIDHeaderLine) {
|
||||
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) {
|
||||
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
|
||||
if ( ! seen.contains(idLine.getID())) {
|
||||
sawPASS = sawPASS || idLine.getID().equals(VCFConstants.PASSES_FILTERS_v4);
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
|
||||
this.version = version;
|
||||
|
||||
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
|
||||
Set<VCFHeaderLine> metaData = new LinkedHashSet<VCFHeaderLine>();
|
||||
Set<String> sampleNames = new LinkedHashSet<String>();
|
||||
int contigCounter = 0;
|
||||
// iterate over all the passed in strings
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ public class VCFHeader {
|
|||
}
|
||||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData = new TreeSet<VCFHeaderLine>();
|
||||
private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
|
||||
|
|
@ -230,14 +230,22 @@ public class VCFHeader {
|
|||
}
|
||||
|
||||
/**
|
||||
* get the meta data, associated with this header
|
||||
* get the meta data, associated with this header, in sorted order
|
||||
*
|
||||
* @return a set of the meta data
|
||||
*/
|
||||
public Set<VCFHeaderLine> getMetaData() {
|
||||
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
||||
public Set<VCFHeaderLine> getMetaDataInInputOrder() {
|
||||
return makeGetMetaDataSet(mMetaData);
|
||||
}
|
||||
|
||||
public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
|
||||
return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
|
||||
}
|
||||
|
||||
private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
|
||||
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
||||
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
|
||||
lines.addAll(mMetaData);
|
||||
lines.addAll(headerLinesInSomeOrder);
|
||||
return Collections.unmodifiableSet(lines);
|
||||
}
|
||||
|
||||
|
|
@ -247,7 +255,7 @@ public class VCFHeader {
|
|||
* @return
|
||||
*/
|
||||
public VCFHeaderLine getMetaDataLine(final String key) {
|
||||
for (final VCFHeaderLine line: getMetaData()) {
|
||||
for (final VCFHeaderLine line: mMetaData) {
|
||||
if ( line.getKey().equals(key) )
|
||||
return line;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -60,8 +59,8 @@ public class VCFStandardHeaderLines {
|
|||
@Requires("header != null")
|
||||
@Ensures("result != null")
|
||||
public static VCFHeader repairStandardHeaderLines(final VCFHeader header) {
|
||||
final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaData().size());
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder().size());
|
||||
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line instanceof VCFFormatHeaderLine ) {
|
||||
line = formatStandards.repair((VCFFormatHeaderLine) line);
|
||||
} else if ( line instanceof VCFInfoHeaderLine) {
|
||||
|
|
|
|||
|
|
@ -25,8 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
@ -34,7 +32,6 @@ import org.broad.tribble.Feature;
|
|||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -129,7 +126,7 @@ public class VCFUtils {
|
|||
if ( source.getRecordType().equals(VariantContext.class)) {
|
||||
VCFHeader header = (VCFHeader)source.getHeader();
|
||||
if ( header != null )
|
||||
fields.addAll(header.getMetaData());
|
||||
fields.addAll(header.getMetaDataInSortedOrder());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -160,7 +157,7 @@ public class VCFUtils {
|
|||
// todo -- needs to remove all version headers from sources and add its own VCF version line
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
for ( VCFHeaderLine line : source.getMetaDataInSortedOrder()) {
|
||||
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFIDHeaderLine )
|
||||
|
|
@ -250,7 +247,7 @@ public class VCFUtils {
|
|||
* @param refDict the SAM formatted reference sequence dictionary
|
||||
*/
|
||||
public final static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) {
|
||||
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaData(), referenceFile, refDict), oldHeader.getGenotypeSamples());
|
||||
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaDataInInputOrder(), referenceFile, refDict), oldHeader.getGenotypeSamples());
|
||||
}
|
||||
|
||||
public final static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict) {
|
||||
|
|
|
|||
|
|
@ -260,6 +260,7 @@ public class VariantContextBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
@Requires({"filter != null", "!filter.equals(\"PASS\")"})
|
||||
public VariantContextBuilder filter(final String filter) {
|
||||
if ( this.filters == null ) this.filters = new LinkedHashSet<String>(1);
|
||||
this.filters.add(filter);
|
||||
|
|
|
|||
|
|
@ -109,7 +109,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void writeHeader(final VCFHeader header) {
|
||||
public void writeHeader(VCFHeader header) {
|
||||
// make sure the header is sorted correctly
|
||||
header = new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples());
|
||||
|
||||
// create the config offsets map
|
||||
if ( header.getContigLines().isEmpty() ) {
|
||||
if ( ALLOW_MISSING_CONTIG_LINES ) {
|
||||
|
|
|
|||
|
|
@ -87,13 +87,13 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
final boolean doNotWriteGenotypes,
|
||||
final String versionLine,
|
||||
final String streamNameForError) {
|
||||
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
|
||||
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
|
||||
|
||||
try {
|
||||
// the file format field needs to be written first
|
||||
writer.write(versionLine + "\n");
|
||||
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
for ( VCFHeaderLine line : header.getMetaDataInSortedOrder() ) {
|
||||
if ( VCFHeaderVersion.isFormatString(line.getKey()) )
|
||||
continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
|
|
@ -26,7 +25,7 @@ public class VCFHeaderUnitTest extends BaseTest {
|
|||
private VCFHeader createHeader(String headerStr) {
|
||||
VCFCodec codec = new VCFCodec();
|
||||
VCFHeader header = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(headerStr))));
|
||||
Assert.assertEquals(header.getMetaData().size(), VCF4headerStringCount);
|
||||
Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount);
|
||||
return header;
|
||||
}
|
||||
|
||||
|
|
@ -98,7 +97,7 @@ public class VCFHeaderUnitTest extends BaseTest {
|
|||
} catch (IOException e) {
|
||||
Assert.fail("Unable to make a temp file!");
|
||||
}
|
||||
for (VCFHeaderLine line : header.getMetaData())
|
||||
for (VCFHeaderLine line : header.getMetaDataInSortedOrder())
|
||||
pw.println(line);
|
||||
pw.close();
|
||||
Assert.assertEquals(md5SumFile(myTempFile), md5sum);
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ public class VCFIntegrationTest extends WalkerTest {
|
|||
executeTest("Test writing samtools WEx BCF example", spec1);
|
||||
}
|
||||
|
||||
@Test(enabled = false) // TODO disabled because current BCF2 is 1 based
|
||||
@Test(enabled = false)
|
||||
public void testReadingSamtoolsWExBCFExample() {
|
||||
String testVCF = privateTestDir + "ex2.bcf";
|
||||
String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s ";
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ public class VariantContextTestProvider {
|
|||
for ( final VariantContext vc : vcs )
|
||||
if ( vc.hasGenotypes() )
|
||||
samples.addAll(vc.getSampleNames());
|
||||
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaData(), samples);
|
||||
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples);
|
||||
this.vcs = vcs;
|
||||
}
|
||||
|
||||
|
|
@ -885,12 +885,12 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
|
||||
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
|
||||
Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size(), "No VCF header lines");
|
||||
Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");
|
||||
|
||||
// for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
|
||||
//Assert.assertEquals(actual.getMetaData(), expected.getMetaData());
|
||||
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaData());
|
||||
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaData());
|
||||
//Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
|
||||
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
|
||||
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
|
||||
for ( int i = 0; i < actualLines.size(); i++ ) {
|
||||
Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,9 +39,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
|
@ -166,7 +163,7 @@ public class VCFWriterUnitTest extends BaseTest {
|
|||
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(header.getMetaData().size(), metaData.size());
|
||||
Assert.assertEquals(header.getMetaDataInSortedOrder().size(), metaData.size());
|
||||
index = 0;
|
||||
for (String key : header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
|
|
|
|||
Loading…
Reference in New Issue