-- BCF2 is now a reference dependent codec so it can initialize the contigs in the case where the file doesn't have contigs in it
-- BCF2 writer can now work without the contig lines being in the header -- Made GenomeLocParser a final class
This commit is contained in:
parent
6301572009
commit
40431890be
|
|
@ -1,3 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.io.storage;
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
@Invariant({
|
||||
"logger != null",
|
||||
"contigInfo != null"})
|
||||
public class GenomeLocParser {
|
||||
public final class GenomeLocParser {
|
||||
private static Logger logger = Logger.getLogger(GenomeLocParser.class);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -86,12 +86,12 @@ public class GenomeLocParser {
|
|||
}
|
||||
|
||||
@Requires("contig != null")
|
||||
public synchronized boolean hasContig(final String contig) {
|
||||
public final synchronized boolean hasContig(final String contig) {
|
||||
return contig.equals(lastContig) || dict.getSequence(contig) != null;
|
||||
}
|
||||
|
||||
@Requires("index >= 0")
|
||||
public synchronized boolean hasContig(final int index) {
|
||||
public final synchronized boolean hasContig(final int index) {
|
||||
return lastIndex == index || dict.getSequence(index) != null;
|
||||
}
|
||||
|
||||
|
|
@ -125,12 +125,12 @@ public class GenomeLocParser {
|
|||
}
|
||||
|
||||
@Requires({"contig != null", "lastContig != null"})
|
||||
private synchronized boolean isCached(final String contig) {
|
||||
private final synchronized boolean isCached(final String contig) {
|
||||
return lastContig.equals(contig);
|
||||
}
|
||||
|
||||
@Requires({"lastIndex != -1", "index >= 0"})
|
||||
private synchronized boolean isCached(final int index) {
|
||||
private final synchronized boolean isCached(final int index) {
|
||||
return lastIndex == index;
|
||||
}
|
||||
|
||||
|
|
@ -144,7 +144,7 @@ public class GenomeLocParser {
|
|||
*/
|
||||
@Requires("contig != null || index >= 0")
|
||||
@Ensures("result != null")
|
||||
private synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
|
||||
private final synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
|
||||
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
|
||||
if ( rec == null ) {
|
||||
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
|
||||
|
|
@ -187,11 +187,11 @@ public class GenomeLocParser {
|
|||
*
|
||||
* @return True if the contig is valid. False otherwise.
|
||||
*/
|
||||
public boolean contigIsInDictionary(String contig) {
|
||||
public final boolean contigIsInDictionary(String contig) {
|
||||
return contig != null && contigInfo.hasContig(contig);
|
||||
}
|
||||
|
||||
public boolean indexIsInDictionary(final int index) {
|
||||
public final boolean indexIsInDictionary(final int index) {
|
||||
return index >= 0 && contigInfo.hasContig(index);
|
||||
}
|
||||
|
||||
|
|
@ -205,7 +205,7 @@ public class GenomeLocParser {
|
|||
*/
|
||||
@Ensures("result != null")
|
||||
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
||||
public SAMSequenceRecord getContigInfo(final String contig) {
|
||||
public final SAMSequenceRecord getContigInfo(final String contig) {
|
||||
if ( contig == null || ! contigIsInDictionary(contig) )
|
||||
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
|
||||
return contigInfo.getSequence(contig);
|
||||
|
|
@ -220,7 +220,7 @@ public class GenomeLocParser {
|
|||
*/
|
||||
@Ensures("result >= 0")
|
||||
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
||||
public int getContigIndex(final String contig) {
|
||||
public final int getContigIndex(final String contig) {
|
||||
return getContigInfo(contig).getSequenceIndex();
|
||||
}
|
||||
|
||||
|
|
@ -231,6 +231,14 @@ public class GenomeLocParser {
|
|||
return contigInfo.getSequenceIndex(contig);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the master sequence dictionary used within this GenomeLocParser
|
||||
* @return
|
||||
*/
|
||||
public final SAMSequenceDictionary getContigs() {
|
||||
return contigInfo.dict;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Low-level creation functions
|
||||
|
|
|
|||
|
|
@ -24,12 +24,15 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -45,7 +48,7 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class BCF2Codec implements FeatureCodec<VariantContext> {
|
||||
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||
private VCFHeader header = null;
|
||||
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||
|
|
@ -147,14 +150,18 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
}
|
||||
}
|
||||
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Reference dependence
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||
if ( dict.size() == 0 )
|
||||
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||
|
||||
return dict;
|
||||
@Override
|
||||
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
|
||||
// initialize contigNames to standard ones in reference
|
||||
for ( final SAMSequenceRecord contig : genomeLocParser.getContigs().getSequences() )
|
||||
contigNames.add(contig.getSequenceName());
|
||||
}
|
||||
|
||||
public boolean isSkippingGenotypes() {
|
||||
|
|
@ -412,4 +419,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
|
||||
}
|
||||
}
|
||||
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||
if ( dict.size() == 0 )
|
||||
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||
|
||||
return dict;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -257,15 +257,29 @@ public class VCFUtils {
|
|||
lines.add(line);
|
||||
}
|
||||
|
||||
final String assembly = getReferenceAssembly(referenceFile.getName());
|
||||
for ( SAMSequenceRecord contig : refDict.getSequences() )
|
||||
lines.add(getContigHeaderLine(contig, assembly));
|
||||
for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) )
|
||||
lines.add(contigLine);
|
||||
|
||||
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath()));
|
||||
return new VCFHeader(lines, oldHeader.getGenotypeSamples());
|
||||
}
|
||||
|
||||
private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
|
||||
/**
|
||||
* Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null
|
||||
* @param refDict
|
||||
* @param referenceFile for assembly name. May be null
|
||||
* @return
|
||||
*/
|
||||
public final static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
|
||||
final File referenceFile) {
|
||||
final List<VCFContigHeaderLine> lines = new ArrayList<VCFContigHeaderLine>();
|
||||
final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null;
|
||||
for ( SAMSequenceRecord contig : refDict.getSequences() )
|
||||
lines.add(makeContigHeaderLine(contig, assembly));
|
||||
return lines;
|
||||
}
|
||||
|
||||
private final static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
|
||||
final Map<String, String> map = new LinkedHashMap<String, String>(3);
|
||||
map.put("ID", contig.getSequenceName());
|
||||
map.put("length", String.valueOf(contig.getSequenceLength()));
|
||||
|
|
|
|||
|
|
@ -60,11 +60,20 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
|
||||
for ( final VCFContigHeaderLine contig : contigLines )
|
||||
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeHeader(final VCFHeader header) {
|
||||
// create the config offsets map
|
||||
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
||||
if ( header.getContigLines().isEmpty() ) {
|
||||
logger.warn("No contig dictionary found in header, falling back to reference sequence dictionary");
|
||||
createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
|
||||
} else {
|
||||
createContigDictionary(header.getContigLines());
|
||||
}
|
||||
|
||||
// set up the map from dictionary string values -> offset
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
|
|
|||
|
|
@ -106,6 +106,13 @@ abstract class IndexingVariantContextWriter implements VariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the reference sequence dictionary used for the variant contexts being written
|
||||
*/
|
||||
public SAMSequenceDictionary getRefDict() {
|
||||
return refDict;
|
||||
}
|
||||
|
||||
/**
|
||||
* add a record to the file
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue