2010-04-20 07:00:08 +08:00
/*
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2010-03-27 02:55:12 +08:00
package org.broadinstitute.sting.gatk.walkers ;
2009-09-08 14:12:18 +08:00
2010-05-03 14:02:35 +08:00
import org.broad.tribble.dbsnp.DbSNPFeature ;
2010-07-11 15:19:16 +08:00
import org.broad.tribble.vcf.* ;
2009-09-08 14:12:18 +08:00
import org.broadinstitute.sting.gatk.contexts.AlignmentContext ;
2009-09-08 21:13:55 +08:00
import org.broadinstitute.sting.gatk.contexts.ReferenceContext ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext ;
2010-07-14 12:56:58 +08:00
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils ;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype ;
2010-03-25 12:53:31 +08:00
import org.broadinstitute.sting.gatk.refdata.* ;
2010-04-20 07:00:08 +08:00
import org.broadinstitute.sting.commandline.Argument ;
2010-05-03 14:02:35 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.utils.genotype.vcf.* ;
2010-04-11 10:59:11 +08:00
import org.broadinstitute.sting.utils.BaseUtils ;
2010-07-14 12:56:58 +08:00
import org.broadinstitute.sting.utils.SampleUtils ;
2009-09-08 14:12:18 +08:00
2009-09-08 21:13:55 +08:00
import java.util.* ;
2009-09-08 14:12:18 +08:00
2009-12-13 05:41:07 +08:00
/ * *
2010-03-25 12:53:31 +08:00
* Converts variants from other file formats to VCF format .
2009-12-13 05:41:07 +08:00
* /
2010-03-27 02:34:59 +08:00
@Requires ( value = { } , referenceMetaData = @RMD ( name = VariantsToVCF . INPUT_ROD_NAME , type = ReferenceOrderedDatum . class ) )
2010-04-19 13:47:17 +08:00
@Reference ( window = @Window ( start = 0 , stop = 40 ) )
2010-03-25 12:53:31 +08:00
public class VariantsToVCF extends RodWalker < Integer , Integer > {
2009-09-08 14:12:18 +08:00
2010-03-27 02:34:59 +08:00
public static final String INPUT_ROD_NAME = "variant" ;
2010-03-25 12:53:31 +08:00
@Argument ( fullName = "sample" , shortName = "sample" , doc = "The sample name represented by the variant rod (for data like GELI with genotypes)" , required = false )
protected String sampleName = null ;
2009-09-08 14:12:18 +08:00
2010-03-25 12:53:31 +08:00
private VCFWriter vcfwriter = null ;
2009-09-08 14:12:18 +08:00
2010-07-14 12:56:58 +08:00
private Set < String > allowedGenotypeFormatStrings = new HashSet < String > ( ) ;
2010-03-25 12:53:31 +08:00
// Don't allow mixed types for now
2010-07-16 23:50:25 +08:00
private EnumSet < VariantContext . Type > ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet . of ( VariantContext . Type . SNP ,
VariantContext . Type . NO_VARIATION , VariantContext . Type . INDEL , VariantContext . Type . MNP ) ;
2009-09-08 14:12:18 +08:00
public Integer map ( RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
2010-04-11 10:59:11 +08:00
if ( tracker = = null | | ! BaseUtils . isRegularBase ( ref . getBase ( ) ) )
2010-03-25 12:53:31 +08:00
return 0 ;
2010-05-03 14:02:35 +08:00
DbSNPFeature dbsnp = DbSNPHelper . getFirstRealSNP ( tracker . getReferenceMetaData ( DbSNPHelper . STANDARD_DBSNP_TRACK_NAME ) ) ;
2010-03-25 12:53:31 +08:00
2010-04-19 13:47:17 +08:00
Collection < VariantContext > contexts = tracker . getVariantContexts ( ref , INPUT_ROD_NAME , ALLOWED_VARIANT_CONTEXT_TYPES , context . getLocation ( ) , true , false ) ;
2010-03-27 02:34:59 +08:00
2010-03-25 12:53:31 +08:00
for ( VariantContext vc : contexts ) {
2010-07-14 12:56:58 +08:00
Map < String , Object > attrs = new HashMap < String , Object > ( vc . getAttributes ( ) ) ;
2010-03-25 12:53:31 +08:00
if ( dbsnp ! = null )
2010-07-21 02:01:45 +08:00
attrs . put ( VariantContext . ID_KEY , dbsnp . getRsID ( ) ) ;
2010-07-14 12:56:58 +08:00
vc = VariantContextUtils . modifyAttributes ( vc , attrs ) ;
2010-03-27 02:34:59 +08:00
// set the appropriate sample name if necessary
2010-07-14 12:56:58 +08:00
if ( sampleName ! = null & & vc . hasGenotypes ( ) & & vc . hasGenotype ( INPUT_ROD_NAME ) ) {
Genotype g = VariantContextUtils . modifyName ( vc . getGenotype ( INPUT_ROD_NAME ) , sampleName ) ;
Map < String , Genotype > genotypes = new HashMap < String , Genotype > ( ) ;
genotypes . put ( sampleName , g ) ;
vc = VariantContextUtils . modifyGenotypes ( vc , genotypes ) ;
}
writeRecord ( vc , tracker , ref . getBase ( ) ) ;
2009-09-08 14:12:18 +08:00
}
2010-03-25 12:53:31 +08:00
return 1 ;
}
2009-09-08 14:12:18 +08:00
2010-07-14 12:56:58 +08:00
private void writeRecord ( VariantContext vc , RefMetaDataTracker tracker , byte ref ) {
2010-03-25 12:53:31 +08:00
if ( vcfwriter = = null ) {
// setup the header fields
Set < VCFHeaderLine > hInfo = new HashSet < VCFHeaderLine > ( ) ;
hInfo . addAll ( VCFUtils . getHeaderFields ( getToolkit ( ) ) ) ;
2010-03-27 02:34:59 +08:00
hInfo . add ( new VCFHeaderLine ( "source" , "VariantsToVCF" ) ) ;
2010-03-25 12:53:31 +08:00
hInfo . add ( new VCFHeaderLine ( "reference" , getToolkit ( ) . getArguments ( ) . referenceFile . getName ( ) ) ) ;
2009-09-08 14:12:18 +08:00
2010-07-14 12:56:58 +08:00
allowedGenotypeFormatStrings . add ( VCFConstants . GENOTYPE_KEY ) ;
for ( VCFHeaderLine field : hInfo ) {
if ( field instanceof VCFFormatHeaderLine ) {
allowedGenotypeFormatStrings . add ( ( ( VCFFormatHeaderLine ) field ) . getName ( ) ) ;
}
}
Set < String > samples = new TreeSet < String > ( ) ;
2010-03-27 02:34:59 +08:00
if ( sampleName ! = null ) {
2010-03-25 12:53:31 +08:00
samples . add ( sampleName ) ;
2010-03-27 02:34:59 +08:00
} else {
2010-07-14 12:56:58 +08:00
// try VCF first
samples = SampleUtils . getSampleListWithVCFHeader ( getToolkit ( ) , Arrays . asList ( INPUT_ROD_NAME ) ) ;
if ( samples . isEmpty ( ) ) {
List < Object > rods = tracker . getReferenceMetaData ( INPUT_ROD_NAME ) ;
if ( rods . size ( ) = = 0 )
throw new IllegalStateException ( "No rod data is present" ) ;
Object rod = rods . get ( 0 ) ;
if ( rod instanceof HapMapROD )
samples . addAll ( Arrays . asList ( ( ( HapMapROD ) rod ) . getSampleIDs ( ) ) ) ;
else
samples . addAll ( vc . getSampleNames ( ) ) ;
}
2010-03-27 02:34:59 +08:00
}
2009-09-10 04:04:32 +08:00
2010-03-27 02:34:59 +08:00
vcfwriter = new VCFWriter ( out ) ;
vcfwriter . writeHeader ( new VCFHeader ( hInfo , samples ) ) ;
2009-09-08 14:12:18 +08:00
}
2010-07-14 12:56:58 +08:00
vc = VariantContextUtils . purgeUnallowedGenotypeAttributes ( vc , allowedGenotypeFormatStrings ) ;
vcfwriter . add ( vc , new byte [ ] { ref } ) ;
2009-09-08 14:12:18 +08:00
}
public Integer reduceInit ( ) {
return 0 ;
}
public Integer reduce ( Integer value , Integer sum ) {
return value + sum ;
}
public void onTraversalDone ( Integer sum ) {
2010-03-27 02:34:59 +08:00
if ( vcfwriter ! = null )
vcfwriter . close ( ) ;
2009-09-08 14:12:18 +08:00
}
}