-- BCF2 is now a reference dependent codec so it can initialize the contigs in the case where the file doesn't have contigs in it
-- BCF2 writer can now work without the contig lines being in the header -- Made GenomeLocParser a final class
This commit is contained in:
parent
6301572009
commit
40431890be
|
|
@ -1,3 +1,27 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.io.storage;
|
package org.broadinstitute.sting.gatk.io.storage;
|
||||||
|
|
||||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
@Invariant({
|
@Invariant({
|
||||||
"logger != null",
|
"logger != null",
|
||||||
"contigInfo != null"})
|
"contigInfo != null"})
|
||||||
public class GenomeLocParser {
|
public final class GenomeLocParser {
|
||||||
private static Logger logger = Logger.getLogger(GenomeLocParser.class);
|
private static Logger logger = Logger.getLogger(GenomeLocParser.class);
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -86,12 +86,12 @@ public class GenomeLocParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Requires("contig != null")
|
@Requires("contig != null")
|
||||||
public synchronized boolean hasContig(final String contig) {
|
public final synchronized boolean hasContig(final String contig) {
|
||||||
return contig.equals(lastContig) || dict.getSequence(contig) != null;
|
return contig.equals(lastContig) || dict.getSequence(contig) != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Requires("index >= 0")
|
@Requires("index >= 0")
|
||||||
public synchronized boolean hasContig(final int index) {
|
public final synchronized boolean hasContig(final int index) {
|
||||||
return lastIndex == index || dict.getSequence(index) != null;
|
return lastIndex == index || dict.getSequence(index) != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -125,12 +125,12 @@ public class GenomeLocParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Requires({"contig != null", "lastContig != null"})
|
@Requires({"contig != null", "lastContig != null"})
|
||||||
private synchronized boolean isCached(final String contig) {
|
private final synchronized boolean isCached(final String contig) {
|
||||||
return lastContig.equals(contig);
|
return lastContig.equals(contig);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Requires({"lastIndex != -1", "index >= 0"})
|
@Requires({"lastIndex != -1", "index >= 0"})
|
||||||
private synchronized boolean isCached(final int index) {
|
private final synchronized boolean isCached(final int index) {
|
||||||
return lastIndex == index;
|
return lastIndex == index;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -144,7 +144,7 @@ public class GenomeLocParser {
|
||||||
*/
|
*/
|
||||||
@Requires("contig != null || index >= 0")
|
@Requires("contig != null || index >= 0")
|
||||||
@Ensures("result != null")
|
@Ensures("result != null")
|
||||||
private synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
|
private final synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
|
||||||
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
|
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
|
||||||
if ( rec == null ) {
|
if ( rec == null ) {
|
||||||
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
|
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
|
||||||
|
|
@ -187,11 +187,11 @@ public class GenomeLocParser {
|
||||||
*
|
*
|
||||||
* @return True if the contig is valid. False otherwise.
|
* @return True if the contig is valid. False otherwise.
|
||||||
*/
|
*/
|
||||||
public boolean contigIsInDictionary(String contig) {
|
public final boolean contigIsInDictionary(String contig) {
|
||||||
return contig != null && contigInfo.hasContig(contig);
|
return contig != null && contigInfo.hasContig(contig);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean indexIsInDictionary(final int index) {
|
public final boolean indexIsInDictionary(final int index) {
|
||||||
return index >= 0 && contigInfo.hasContig(index);
|
return index >= 0 && contigInfo.hasContig(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -205,7 +205,7 @@ public class GenomeLocParser {
|
||||||
*/
|
*/
|
||||||
@Ensures("result != null")
|
@Ensures("result != null")
|
||||||
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
||||||
public SAMSequenceRecord getContigInfo(final String contig) {
|
public final SAMSequenceRecord getContigInfo(final String contig) {
|
||||||
if ( contig == null || ! contigIsInDictionary(contig) )
|
if ( contig == null || ! contigIsInDictionary(contig) )
|
||||||
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
|
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
|
||||||
return contigInfo.getSequence(contig);
|
return contigInfo.getSequence(contig);
|
||||||
|
|
@ -220,7 +220,7 @@ public class GenomeLocParser {
|
||||||
*/
|
*/
|
||||||
@Ensures("result >= 0")
|
@Ensures("result >= 0")
|
||||||
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
|
||||||
public int getContigIndex(final String contig) {
|
public final int getContigIndex(final String contig) {
|
||||||
return getContigInfo(contig).getSequenceIndex();
|
return getContigInfo(contig).getSequenceIndex();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -231,6 +231,14 @@ public class GenomeLocParser {
|
||||||
return contigInfo.getSequenceIndex(contig);
|
return contigInfo.getSequenceIndex(contig);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the master sequence dictionary used within this GenomeLocParser
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public final SAMSequenceDictionary getContigs() {
|
||||||
|
return contigInfo.dict;
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Low-level creation functions
|
// Low-level creation functions
|
||||||
|
|
|
||||||
|
|
@ -24,12 +24,15 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broad.tribble.Feature;
|
import org.broad.tribble.Feature;
|
||||||
import org.broad.tribble.FeatureCodec;
|
import org.broad.tribble.FeatureCodec;
|
||||||
import org.broad.tribble.FeatureCodecHeader;
|
import org.broad.tribble.FeatureCodecHeader;
|
||||||
import org.broad.tribble.readers.AsciiLineReader;
|
import org.broad.tribble.readers.AsciiLineReader;
|
||||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
@ -45,7 +48,7 @@ import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
public class BCF2Codec implements FeatureCodec<VariantContext> {
|
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
|
||||||
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||||
private VCFHeader header = null;
|
private VCFHeader header = null;
|
||||||
private final ArrayList<String> contigNames = new ArrayList<String>();
|
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||||
|
|
@ -147,14 +150,18 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
// --------------------------------------------------------------------------------
|
||||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
//
|
||||||
|
// Reference dependence
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
|
||||||
if ( dict.size() == 0 )
|
|
||||||
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
|
||||||
|
|
||||||
return dict;
|
@Override
|
||||||
|
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
|
||||||
|
// initialize contigNames to standard ones in reference
|
||||||
|
for ( final SAMSequenceRecord contig : genomeLocParser.getContigs().getSequences() )
|
||||||
|
contigNames.add(contig.getSequenceName());
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSkippingGenotypes() {
|
public boolean isSkippingGenotypes() {
|
||||||
|
|
@ -412,4 +419,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
||||||
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
|
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||||
|
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||||
|
|
||||||
|
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||||
|
if ( dict.size() == 0 )
|
||||||
|
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||||
|
|
||||||
|
return dict;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -257,15 +257,29 @@ public class VCFUtils {
|
||||||
lines.add(line);
|
lines.add(line);
|
||||||
}
|
}
|
||||||
|
|
||||||
final String assembly = getReferenceAssembly(referenceFile.getName());
|
for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) )
|
||||||
for ( SAMSequenceRecord contig : refDict.getSequences() )
|
lines.add(contigLine);
|
||||||
lines.add(getContigHeaderLine(contig, assembly));
|
|
||||||
|
|
||||||
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath()));
|
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath()));
|
||||||
return new VCFHeader(lines, oldHeader.getGenotypeSamples());
|
return new VCFHeader(lines, oldHeader.getGenotypeSamples());
|
||||||
}
|
}
|
||||||
|
|
||||||
private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
|
/**
|
||||||
|
* Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null
|
||||||
|
* @param refDict
|
||||||
|
* @param referenceFile for assembly name. May be null
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public final static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
|
||||||
|
final File referenceFile) {
|
||||||
|
final List<VCFContigHeaderLine> lines = new ArrayList<VCFContigHeaderLine>();
|
||||||
|
final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null;
|
||||||
|
for ( SAMSequenceRecord contig : refDict.getSequences() )
|
||||||
|
lines.add(makeContigHeaderLine(contig, assembly));
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
|
||||||
final Map<String, String> map = new LinkedHashMap<String, String>(3);
|
final Map<String, String> map = new LinkedHashMap<String, String>(3);
|
||||||
map.put("ID", contig.getSequenceName());
|
map.put("ID", contig.getSequenceName());
|
||||||
map.put("length", String.valueOf(contig.getSequenceLength()));
|
map.put("length", String.valueOf(contig.getSequenceLength()));
|
||||||
|
|
|
||||||
|
|
@ -60,11 +60,20 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
|
||||||
|
for ( final VCFContigHeaderLine contig : contigLines )
|
||||||
|
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeHeader(final VCFHeader header) {
|
public void writeHeader(final VCFHeader header) {
|
||||||
// create the config offsets map
|
// create the config offsets map
|
||||||
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
if ( header.getContigLines().isEmpty() ) {
|
||||||
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
logger.warn("No contig dictionary found in header, falling back to reference sequence dictionary");
|
||||||
|
createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
|
||||||
|
} else {
|
||||||
|
createContigDictionary(header.getContigLines());
|
||||||
|
}
|
||||||
|
|
||||||
// set up the map from dictionary string values -> offset
|
// set up the map from dictionary string values -> offset
|
||||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||||
|
|
|
||||||
|
|
@ -106,6 +106,13 @@ abstract class IndexingVariantContextWriter implements VariantContextWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the reference sequence dictionary used for the variant contexts being written
|
||||||
|
*/
|
||||||
|
public SAMSequenceDictionary getRefDict() {
|
||||||
|
return refDict;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add a record to the file
|
* add a record to the file
|
||||||
*
|
*
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue