/*
 * Copyright (c) 2012 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2011-06-25 00:56:04 +08:00
2012-12-19 03:56:48 +08:00
package org.broadinstitute.variant.vcf ;
2011-06-25 00:56:04 +08:00
2012-12-19 03:56:48 +08:00
import org.broad.tribble.TribbleException ;
2011-06-25 00:56:04 +08:00
import org.broad.tribble.util.ParsingUtils ;
2013-01-09 03:45:50 +08:00
import org.broadinstitute.variant.utils.GeneralUtils ;
2011-06-25 00:56:04 +08:00
import java.util.* ;
/**
 * Class VCFHeader
 * <p>
 * A class representing the VCF header.
 * <p>
 * Known design problems: this class allows duplicate entries in the metadata
 * and stores header lines in several overlapping collections.
 * <p>
 * TODO: clean this class up.
 *
 * @author aaron
 */
public class VCFHeader {
// the mandatory header fields
public enum HEADER_FIELDS {
CHROM , POS , ID , REF , ALT , QUAL , FILTER , INFO
}
// the associated meta data
2012-07-09 06:43:03 +08:00
private final Set < VCFHeaderLine > mMetaData = new LinkedHashSet < VCFHeaderLine > ( ) ;
2012-08-16 02:36:06 +08:00
private final Map < String , VCFInfoHeaderLine > mInfoMetaData = new LinkedHashMap < String , VCFInfoHeaderLine > ( ) ;
private final Map < String , VCFFormatHeaderLine > mFormatMetaData = new LinkedHashMap < String , VCFFormatHeaderLine > ( ) ;
private final Map < String , VCFFilterHeaderLine > mFilterMetaData = new LinkedHashMap < String , VCFFilterHeaderLine > ( ) ;
private final Map < String , VCFHeaderLine > mOtherMetaData = new LinkedHashMap < String , VCFHeaderLine > ( ) ;
2012-05-10 05:55:42 +08:00
private final List < VCFContigHeaderLine > contigMetaData = new ArrayList < VCFContigHeaderLine > ( ) ;
2011-06-25 00:56:04 +08:00
// the list of auxillary tags
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
private final List < String > mGenotypeSampleNames = new ArrayList < String > ( ) ;
2011-06-25 00:56:04 +08:00
// the character string that indicates meta data
public static final String METADATA_INDICATOR = "##" ;
// the header string indicator
public static final String HEADER_INDICATOR = "#" ;
2012-04-17 23:45:32 +08:00
public static final String SOURCE_KEY = "source" ;
public static final String REFERENCE_KEY = "reference" ;
public static final String CONTIG_KEY = "contig" ;
public static final String INTERVALS_KEY = "intervals" ;
2013-01-17 01:43:15 +08:00
public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals" ;
public static final String INTERVAL_MERGING_KEY = "interval_merging" ;
public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule" ;
public static final String INTERVAL_PADDING_KEY = "interval_padding" ;
2012-04-17 23:45:32 +08:00
2011-06-25 00:56:04 +08:00
// were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true ;
2011-11-19 01:39:10 +08:00
// cache for efficient conversion of VCF -> VariantContext
2012-06-01 01:47:31 +08:00
private ArrayList < String > sampleNamesInOrder = null ;
private HashMap < String , Integer > sampleNameToOffset = null ;
2011-11-19 01:39:10 +08:00
2012-04-17 23:45:32 +08:00
private boolean writeEngineHeaders = true ;
private boolean writeCommandLine = true ;
2011-06-25 00:56:04 +08:00
2012-05-15 03:34:32 +08:00
/ * *
* Create an empty VCF header with no header lines and no samples
* /
public VCFHeader ( ) {
this ( Collections . < VCFHeaderLine > emptySet ( ) , Collections . < String > emptySet ( ) ) ;
}
2011-06-25 00:56:04 +08:00
/ * *
* create a VCF header , given a list of meta data and auxillary tags
*
* @param metaData the meta data associated with this header
* /
public VCFHeader ( Set < VCFHeaderLine > metaData ) {
2012-05-10 05:55:42 +08:00
mMetaData . addAll ( metaData ) ;
2011-06-25 00:56:04 +08:00
loadVCFVersion ( ) ;
loadMetaDataMaps ( ) ;
}
2012-08-16 02:36:06 +08:00
/ * *
* Creates a shallow copy of the meta data in VCF header toCopy
*
* @param toCopy
* /
public VCFHeader ( final VCFHeader toCopy ) {
this ( toCopy . mMetaData ) ;
}
2011-06-25 00:56:04 +08:00
/ * *
* create a VCF header , given a list of meta data and auxillary tags
*
* @param metaData the meta data associated with this header
2011-09-22 10:23:28 +08:00
* @param genotypeSampleNames the sample names
2011-06-25 00:56:04 +08:00
* /
public VCFHeader ( Set < VCFHeaderLine > metaData , Set < String > genotypeSampleNames ) {
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
this ( metaData , new ArrayList < String > ( genotypeSampleNames ) ) ;
}
public VCFHeader ( Set < VCFHeaderLine > metaData , List < String > genotypeSampleNames ) {
2012-05-10 05:55:42 +08:00
this ( metaData ) ;
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
if ( genotypeSampleNames . size ( ) ! = new HashSet < String > ( genotypeSampleNames ) . size ( ) )
2012-12-19 03:56:48 +08:00
throw new TribbleException . InvalidHeader ( "BUG: VCF header has duplicate sample names" ) ;
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
2011-09-22 10:23:28 +08:00
mGenotypeSampleNames . addAll ( genotypeSampleNames ) ;
2011-06-25 00:56:04 +08:00
samplesWereAlreadySorted = ParsingUtils . isSorted ( genotypeSampleNames ) ;
2012-05-10 05:55:42 +08:00
buildVCFReaderMaps ( genotypeSampleNames ) ;
2011-06-25 00:56:04 +08:00
}
2011-11-19 01:39:10 +08:00
/ * *
* Tell this VCF header to use pre - calculated sample name ordering and the
* sample name - > offset map . This assumes that all VariantContext created
* using this header ( i . e . , read by the VCFCodec ) will have genotypes
* occurring in the same order
*
2012-05-10 05:55:42 +08:00
* @param genotypeSampleNamesInAppearenceOrder genotype sample names , must iterator in order of appearence
2011-11-19 01:39:10 +08:00
* /
2012-05-10 05:55:42 +08:00
private void buildVCFReaderMaps ( Collection < String > genotypeSampleNamesInAppearenceOrder ) {
2011-11-19 01:39:10 +08:00
sampleNamesInOrder = new ArrayList < String > ( genotypeSampleNamesInAppearenceOrder . size ( ) ) ;
sampleNameToOffset = new HashMap < String , Integer > ( genotypeSampleNamesInAppearenceOrder . size ( ) ) ;
int i = 0 ;
for ( final String name : genotypeSampleNamesInAppearenceOrder ) {
sampleNamesInOrder . add ( name ) ;
sampleNameToOffset . put ( name , i + + ) ;
}
Collections . sort ( sampleNamesInOrder ) ;
}
2011-06-25 00:56:04 +08:00
/ * *
* Adds a header line to the header metadata .
*
* @param headerLine Line to add to the existing metadata component .
* /
public void addMetaDataLine ( VCFHeaderLine headerLine ) {
2012-05-10 05:55:42 +08:00
mMetaData . add ( headerLine ) ;
2012-10-27 04:34:07 +08:00
loadMetaDataMaps ( ) ;
2012-05-10 05:55:42 +08:00
}
/ * *
2012-08-16 02:36:06 +08:00
* @return all of the VCF header lines of the # # contig form in order , or an empty list if none were present
2012-05-10 05:55:42 +08:00
* /
public List < VCFContigHeaderLine > getContigLines ( ) {
return Collections . unmodifiableList ( contigMetaData ) ;
2011-06-25 00:56:04 +08:00
}
2012-08-16 02:36:06 +08:00
/ * *
* @return all of the VCF FILTER lines in their original file order , or an empty list if none were present
* /
public List < VCFFilterHeaderLine > getFilterLines ( ) {
final List < VCFFilterHeaderLine > filters = new ArrayList < VCFFilterHeaderLine > ( ) ;
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFFilterHeaderLine ) {
filters . add ( ( VCFFilterHeaderLine ) line ) ;
}
}
return filters ;
}
/ * *
* @return all of the VCF FILTER lines in their original file order , or an empty list if none were present
* /
public List < VCFIDHeaderLine > getIDHeaderLines ( ) {
final List < VCFIDHeaderLine > filters = new ArrayList < VCFIDHeaderLine > ( ) ;
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFIDHeaderLine ) {
filters . add ( ( VCFIDHeaderLine ) line ) ;
}
}
return filters ;
}
2011-06-25 00:56:04 +08:00
/ * *
* check our metadata for a VCF version tag , and throw an exception if the version is out of date
* or the version is not present
* /
public void loadVCFVersion ( ) {
List < VCFHeaderLine > toRemove = new ArrayList < VCFHeaderLine > ( ) ;
for ( VCFHeaderLine line : mMetaData )
if ( VCFHeaderVersion . isFormatString ( line . getKey ( ) ) ) {
toRemove . add ( line ) ;
}
// remove old header lines for now,
mMetaData . removeAll ( toRemove ) ;
}
/ * *
* load the format / info meta data maps ( these are used for quick lookup by key name )
* /
private void loadMetaDataMaps ( ) {
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoLine = ( VCFInfoHeaderLine ) line ;
2012-06-14 09:49:22 +08:00
addMetaDataMapBinding ( mInfoMetaData , infoLine ) ;
2012-05-10 05:55:42 +08:00
} else if ( line instanceof VCFFormatHeaderLine ) {
2011-06-25 00:56:04 +08:00
VCFFormatHeaderLine formatLine = ( VCFFormatHeaderLine ) line ;
2012-06-14 09:49:22 +08:00
addMetaDataMapBinding ( mFormatMetaData , formatLine ) ;
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
} else if ( line instanceof VCFFilterHeaderLine ) {
VCFFilterHeaderLine filterLine = ( VCFFilterHeaderLine ) line ;
mFilterMetaData . put ( filterLine . getID ( ) , filterLine ) ;
2012-05-10 05:55:42 +08:00
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData . add ( ( VCFContigHeaderLine ) line ) ;
} else {
2011-09-10 04:10:30 +08:00
mOtherMetaData . put ( line . getKey ( ) , line ) ;
}
2011-06-25 00:56:04 +08:00
}
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
2012-06-21 03:35:36 +08:00
if ( hasFormatLine ( VCFConstants . GENOTYPE_LIKELIHOODS_KEY ) & & ! hasFormatLine ( VCFConstants . GENOTYPE_PL_KEY ) ) {
2013-01-09 03:45:50 +08:00
if ( GeneralUtils . DEBUG_MODE_ENABLED ) {
System . err . println ( "Found " + VCFConstants . GENOTYPE_LIKELIHOODS_KEY + " format, but no "
+ VCFConstants . GENOTYPE_PL_KEY + " field. We now only manage PL fields internally"
+ " automatically adding a corresponding PL field to your VCF header" ) ;
}
2012-06-21 03:35:36 +08:00
addMetaDataLine ( new VCFFormatHeaderLine ( VCFConstants . GENOTYPE_PL_KEY , VCFHeaderLineCount . G , VCFHeaderLineType . Integer , "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification" ) ) ;
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
}
2011-06-25 00:56:04 +08:00
}
2012-06-14 09:49:22 +08:00
/ * *
* Add line to map , issuing warnings about duplicates
*
* @param map
* @param line
* @param < T >
* /
private final < T extends VCFCompoundHeaderLine > void addMetaDataMapBinding ( final Map < String , T > map , T line ) {
final String key = line . getID ( ) ;
2013-01-09 03:45:50 +08:00
if ( map . containsKey ( key ) ) {
if ( GeneralUtils . DEBUG_MODE_ENABLED ) {
System . err . println ( "Found duplicate VCF header lines for " + key + "; keeping the first only" ) ;
}
}
else {
2012-06-14 09:49:22 +08:00
map . put ( key , line ) ;
2013-01-09 03:45:50 +08:00
}
2012-06-14 09:49:22 +08:00
}
2011-06-25 00:56:04 +08:00
/ * *
* get the header fields in order they ' re presented in the input file ( which is now required to be
* the order presented in the spec ) .
*
* @return a set of the header fields , in order
* /
public Set < HEADER_FIELDS > getHeaderFields ( ) {
2012-04-17 23:45:32 +08:00
return new LinkedHashSet < HEADER_FIELDS > ( Arrays . asList ( HEADER_FIELDS . values ( ) ) ) ;
2011-06-25 00:56:04 +08:00
}
/ * *
2012-07-09 06:43:03 +08:00
* get the meta data , associated with this header , in sorted order
2011-06-25 00:56:04 +08:00
*
* @return a set of the meta data
* /
2012-07-09 06:43:03 +08:00
public Set < VCFHeaderLine > getMetaDataInInputOrder ( ) {
return makeGetMetaDataSet ( mMetaData ) ;
}
public Set < VCFHeaderLine > getMetaDataInSortedOrder ( ) {
return makeGetMetaDataSet ( new TreeSet < VCFHeaderLine > ( mMetaData ) ) ;
}
private static Set < VCFHeaderLine > makeGetMetaDataSet ( final Set < VCFHeaderLine > headerLinesInSomeOrder ) {
final Set < VCFHeaderLine > lines = new LinkedHashSet < VCFHeaderLine > ( ) ;
2012-05-31 23:19:53 +08:00
lines . add ( new VCFHeaderLine ( VCFHeaderVersion . VCF4_1 . getFormatString ( ) , VCFHeaderVersion . VCF4_1 . getVersionString ( ) ) ) ;
2012-07-09 06:43:03 +08:00
lines . addAll ( headerLinesInSomeOrder ) ;
2011-06-25 00:56:04 +08:00
return Collections . unmodifiableSet ( lines ) ;
}
2012-05-22 04:38:28 +08:00
/ * *
* Get the VCFHeaderLine whose key equals key . Returns null if no such line exists
* @param key
* @return
* /
public VCFHeaderLine getMetaDataLine ( final String key ) {
2012-07-09 06:43:03 +08:00
for ( final VCFHeaderLine line : mMetaData ) {
2012-05-22 04:38:28 +08:00
if ( line . getKey ( ) . equals ( key ) )
return line ;
}
return null ;
}
2011-06-25 00:56:04 +08:00
/ * *
* get the genotyping sample names
*
* @return a list of the genotype column names , which may be empty if hasGenotypingData ( ) returns false
* /
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
public List < String > getGenotypeSamples ( ) {
2011-06-25 00:56:04 +08:00
return mGenotypeSampleNames ;
}
2012-06-14 09:49:22 +08:00
public int getNGenotypeSamples ( ) {
return mGenotypeSampleNames . size ( ) ;
}
2011-06-25 00:56:04 +08:00
/ * *
* do we have genotyping data ?
*
* @return true if we have genotyping columns , false otherwise
* /
public boolean hasGenotypingData ( ) {
2012-06-14 09:49:22 +08:00
return getNGenotypeSamples ( ) > 0 ;
2011-06-25 00:56:04 +08:00
}
/ * *
* were the input samples sorted originally ?
*
* @return true if the input samples were sorted originally , false otherwise
* /
public boolean samplesWereAlreadySorted ( ) {
return samplesWereAlreadySorted ;
}
/** @return the column count */
public int getColumnCount ( ) {
2011-09-22 10:23:28 +08:00
return HEADER_FIELDS . values ( ) . length + ( hasGenotypingData ( ) ? mGenotypeSampleNames . size ( ) + 1 : 0 ) ;
2011-06-25 00:56:04 +08:00
}
2012-08-16 02:36:06 +08:00
/ * *
* Returns the INFO HeaderLines in their original ordering
* /
2012-06-14 09:49:22 +08:00
public Collection < VCFInfoHeaderLine > getInfoHeaderLines ( ) {
return mInfoMetaData . values ( ) ;
}
2012-08-16 02:36:06 +08:00
/ * *
* Returns the FORMAT HeaderLines in their original ordering
* /
2012-06-14 09:49:22 +08:00
public Collection < VCFFormatHeaderLine > getFormatHeaderLines ( ) {
return mFormatMetaData . values ( ) ;
}
2011-06-25 00:56:04 +08:00
/ * *
2012-05-23 04:27:13 +08:00
* @param id the header key name
2011-06-25 00:56:04 +08:00
* @return the meta data line , or null if there is none
* /
2012-05-23 04:27:13 +08:00
public VCFInfoHeaderLine getInfoHeaderLine ( String id ) {
return mInfoMetaData . get ( id ) ;
2011-06-25 00:56:04 +08:00
}
/ * *
2012-05-23 04:27:13 +08:00
* @param id the header key name
2011-06-25 00:56:04 +08:00
* @return the meta data line , or null if there is none
* /
2012-05-23 04:27:13 +08:00
public VCFFormatHeaderLine getFormatHeaderLine ( String id ) {
return mFormatMetaData . get ( id ) ;
2011-06-25 00:56:04 +08:00
}
2011-09-10 04:10:30 +08:00
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
/ * *
* @param id the header key name
* @return the meta data line , or null if there is none
* /
public VCFFilterHeaderLine getFilterHeaderLine ( final String id ) {
return mFilterMetaData . get ( id ) ;
}
public boolean hasInfoLine ( final String id ) {
return getInfoHeaderLine ( id ) ! = null ;
}
public boolean hasFormatLine ( final String id ) {
return getFormatHeaderLine ( id ) ! = null ;
}
public boolean hasFilterLine ( final String id ) {
return getFilterHeaderLine ( id ) ! = null ;
}
2011-09-10 04:10:30 +08:00
/ * *
* @param key the header key name
* @return the meta data line , or null if there is none
* /
public VCFHeaderLine getOtherHeaderLine ( String key ) {
return mOtherMetaData . get ( key ) ;
}
2011-06-25 00:56:04 +08:00
2012-04-17 23:45:32 +08:00
/ * *
* If true additional engine headers will be written to the VCF , otherwise only the walker headers will be output .
* @return true if additional engine headers will be written to the VCF
* /
public boolean isWriteEngineHeaders ( ) {
return writeEngineHeaders ;
}
2011-06-25 00:56:04 +08:00
2012-04-17 23:45:32 +08:00
/ * *
* If true additional engine headers will be written to the VCF , otherwise only the walker headers will be output .
* @param writeEngineHeaders true if additional engine headers will be written to the VCF
* /
public void setWriteEngineHeaders ( boolean writeEngineHeaders ) {
this . writeEngineHeaders = writeEngineHeaders ;
}
/ * *
* If true , and isWriteEngineHeaders also returns true , the command line will be written to the VCF .
* @return true if the command line will be written to the VCF
* /
public boolean isWriteCommandLine ( ) {
return writeCommandLine ;
}
2011-06-25 00:56:04 +08:00
2012-04-17 23:45:32 +08:00
/ * *
* If true , and isWriteEngineHeaders also returns true , the command line will be written to the VCF .
* @param writeCommandLine true if the command line will be written to the VCF
* /
public void setWriteCommandLine ( boolean writeCommandLine ) {
this . writeCommandLine = writeCommandLine ;
}
2012-06-01 01:47:31 +08:00
public ArrayList < String > getSampleNamesInOrder ( ) {
return sampleNamesInOrder ;
}
public HashMap < String , Integer > getSampleNameToOffset ( ) {
return sampleNameToOffset ;
}
2012-08-16 02:36:06 +08:00
@Override
public String toString ( ) {
final StringBuilder b = new StringBuilder ( ) ;
b . append ( "[VCFHeader:" ) ;
for ( final VCFHeaderLine line : mMetaData )
b . append ( "\n\t" ) . append ( line ) ;
return b . append ( "\n]" ) . toString ( ) ;
}
2012-04-17 23:45:32 +08:00
}