-- BCF2 is now a reference dependent codec so it can initialize the contigs in the case where the file doesn't have contigs in it

-- BCF2 writer can now work without the contig lines being in the header
-- Made GenomeLocParser a final class
This commit is contained in:
Mark DePristo 2012-05-18 11:38:51 -04:00
parent 6301572009
commit 40431890be
6 changed files with 102 additions and 23 deletions

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.io.storage;
import net.sf.samtools.util.BlockCompressedOutputStream;

View File

@ -46,7 +46,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@Invariant({
"logger != null",
"contigInfo != null"})
public class GenomeLocParser {
public final class GenomeLocParser {
private static Logger logger = Logger.getLogger(GenomeLocParser.class);
// --------------------------------------------------------------------------------------------------------------
@ -86,12 +86,12 @@ public class GenomeLocParser {
}
@Requires("contig != null")
public synchronized boolean hasContig(final String contig) {
public final synchronized boolean hasContig(final String contig) {
return contig.equals(lastContig) || dict.getSequence(contig) != null;
}
@Requires("index >= 0")
public synchronized boolean hasContig(final int index) {
public final synchronized boolean hasContig(final int index) {
return lastIndex == index || dict.getSequence(index) != null;
}
@ -125,12 +125,12 @@ public class GenomeLocParser {
}
@Requires({"contig != null", "lastContig != null"})
private synchronized boolean isCached(final String contig) {
private final synchronized boolean isCached(final String contig) {
return lastContig.equals(contig);
}
@Requires({"lastIndex != -1", "index >= 0"})
private synchronized boolean isCached(final int index) {
private final synchronized boolean isCached(final int index) {
return lastIndex == index;
}
@ -144,7 +144,7 @@ public class GenomeLocParser {
*/
@Requires("contig != null || index >= 0")
@Ensures("result != null")
private synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
private final synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
if ( rec == null ) {
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
@ -187,11 +187,11 @@ public class GenomeLocParser {
*
* @return True if the contig is valid. False otherwise.
*/
public boolean contigIsInDictionary(String contig) {
public final boolean contigIsInDictionary(String contig) {
return contig != null && contigInfo.hasContig(contig);
}
public boolean indexIsInDictionary(final int index) {
public final boolean indexIsInDictionary(final int index) {
return index >= 0 && contigInfo.hasContig(index);
}
@ -205,7 +205,7 @@ public class GenomeLocParser {
*/
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
public SAMSequenceRecord getContigInfo(final String contig) {
public final SAMSequenceRecord getContigInfo(final String contig) {
if ( contig == null || ! contigIsInDictionary(contig) )
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
return contigInfo.getSequence(contig);
@ -220,7 +220,7 @@ public class GenomeLocParser {
*/
@Ensures("result >= 0")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
public int getContigIndex(final String contig) {
public final int getContigIndex(final String contig) {
return getContigInfo(contig).getSequenceIndex();
}
@ -231,6 +231,14 @@ public class GenomeLocParser {
return contigInfo.getSequenceIndex(contig);
}
/**
* Return the master sequence dictionary used within this GenomeLocParser
* @return
*/
public final SAMSequenceDictionary getContigs() {
return contigInfo.dict;
}
// --------------------------------------------------------------------------------------------------------------
//
// Low-level creation functions

View File

@ -24,12 +24,15 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -45,7 +48,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext> {
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
private final ArrayList<String> contigNames = new ArrayList<String>();
@ -147,14 +150,18 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
}
}
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// --------------------------------------------------------------------------------
//
// Reference dependence
//
// --------------------------------------------------------------------------------
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dict.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
return dict;
@Override
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
// initialize contigNames to standard ones in reference
for ( final SAMSequenceRecord contig : genomeLocParser.getContigs().getSequences() )
contigNames.add(contig.getSequenceName());
}
public boolean isSkippingGenotypes() {
@ -412,4 +419,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
}
}
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dict.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
return dict;
}
}

View File

@ -257,15 +257,29 @@ public class VCFUtils {
lines.add(line);
}
final String assembly = getReferenceAssembly(referenceFile.getName());
for ( SAMSequenceRecord contig : refDict.getSequences() )
lines.add(getContigHeaderLine(contig, assembly));
for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) )
lines.add(contigLine);
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath()));
return new VCFHeader(lines, oldHeader.getGenotypeSamples());
}
private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
/**
* Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null
* @param refDict
* @param referenceFile for assembly name. May be null
* @return
*/
public final static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
final File referenceFile) {
final List<VCFContigHeaderLine> lines = new ArrayList<VCFContigHeaderLine>();
final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null;
for ( SAMSequenceRecord contig : refDict.getSequences() )
lines.add(makeContigHeaderLine(contig, assembly));
return lines;
}
private final static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
final Map<String, String> map = new LinkedHashMap<String, String>(3);
map.put("ID", contig.getSequenceName());
map.put("length", String.valueOf(contig.getSequenceLength()));

View File

@ -60,11 +60,20 @@ class BCF2Writer extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
for ( final VCFContigHeaderLine contig : contigLines )
contigDictionary.put(contig.getID(), contig.getContigIndex());
}
@Override
public void writeHeader(final VCFHeader header) {
// create the config offsets map
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigDictionary.put(contig.getID(), contig.getContigIndex());
if ( header.getContigLines().isEmpty() ) {
logger.warn("No contig dictionary found in header, falling back to reference sequence dictionary");
createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
} else {
createContigDictionary(header.getContigLines());
}
// set up the map from dictionary string values -> offset
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);

View File

@ -106,6 +106,13 @@ abstract class IndexingVariantContextWriter implements VariantContextWriter {
}
}
/**
* @return the reference sequence dictionary used for the variant contexts being written
*/
public SAMSequenceDictionary getRefDict() {
return refDict;
}
/**
* add a record to the file
*