diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 0328cd2e4..0e76489b7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -123,7 +123,7 @@ public class VariantAnnotator extends LocusWalker { */ public Integer reduceInit() { return 0; } - + /** * We want reads that span deletions * @@ -154,16 +154,20 @@ public class VariantAnnotator extends LocusWalker { return 0; // if the reference base is not ambiguous, we can annotate + Collection annotatedVCs = Arrays.asList( new VariantContext[] { vc } ); if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); if ( stratifiedContexts != null ) { - vc = engine.annotateContext(tracker, ref, stratifiedContexts, vc); + annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc); + } + } + + if ( variant instanceof RodVCF ) { + for(VariantContext annotatedVC : annotatedVCs ) { + vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase())); } } - if ( variant instanceof RodVCF ) - vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase())); - return 1; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index d6cbab097..0521dd228 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.playground.gatk.walkers.annotator.GenomicAnnotation; import org.broadinstitute.sting.utils.PackageUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine; @@ -33,6 +34,12 @@ public class VariantAnnotatorEngine { // how about hapmap3? private boolean annotateHapmap3 = false; + // command-line option used for GenomicAnnotation. + private Map> requestedColumnsMap; + + // command-line option used for GenomicAnnotation. + private boolean explode; + // use this constructor if you want all possible annotations public VariantAnnotatorEngine(GenomeAnalysisEngine engine) { @@ -143,7 +150,7 @@ public class VariantAnnotatorEngine { return descriptions; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Collection annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new HashMap(vc.getAttributes()); @@ -166,12 +173,50 @@ public class VariantAnnotatorEngine { infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1"); } - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) { - Map result = annotation.annotate(tracker, ref, stratifiedContexts, vc); - if ( result != null ) - infoAnnotations.putAll(result); + + //Process the info field + List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file + infoAnnotationOutputsList.add(new HashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file gains 1 line. + + //go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) + { + Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); + if ( annotationsFromCurrentType == null ) { + continue; + } + + if(annotationType instanceof GenomicAnnotation) + { + //go through the annotations returned by GenericAnnotation for each -B input file. + for( Map.Entry annotationsFromInputFile : annotationsFromCurrentType.entrySet() ) + { + final String inputFileBindingName = annotationsFromInputFile.getKey(); + final List> matchingRecords = (List>) annotationsFromInputFile.getValue(); + + if( matchingRecords.size() > 1 && explode) + { + //More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size(). + infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputFileBindingName); + } + else + { + //This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will + //all be added to the same output line, with keys disambiguated by appending _i . + addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputFileBindingName); + } + } + } + else + { + //add the annotations to each output line. + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + infoAnnotationOutput.putAll(annotationsFromCurrentType); + } + } } + //Process genotypes Map genotypes; if ( requestedGenotypeAnnotations.size() == 0 ) { genotypes = vc.getGenotypes(); @@ -195,6 +240,161 @@ public class VariantAnnotatorEngine { } } - return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotations); + //Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList + Collection returnValue = new LinkedList(); + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + returnValue.add( new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotationOutput) ); + } + + return returnValue; } -} \ No newline at end of file + + + /** + * Implements non-explode mode, where the output lines have a one-to-one relationship + * with the input variants, and all multiple-match records are collapsed into the single info field. + * The collapsing is done by appending an _i to each key name (where 'i' is a record counter). + * + * @param infoAnnotationOutputsList + * @param matchingRecords + * @param bindingName + */ + private void addToExistingAnnotationOutputs( + final List> infoAnnotationOutputsList, + final List> matchingRecords, + final String bindingName) { + //For each matching record, just add its annotations to all existing output lines. + final boolean renameKeys = matchingRecords.size() > 1; + for(int i = 0; i < matchingRecords.size(); i++) { + Map annotationsForRecord = matchingRecords.get(i); + annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested. + + if(renameKeys) { + //Rename keys to avoid naming conflicts (eg. if you have multiple dbsnp matches, + // dbSNP.avHet=value1 from record 1 and dbSNP.avHet=value2 from record 2 will become dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 ) + Map annotationsForRecordWithRenamedKeys = new HashMap(); + for(Map.Entry annotation : annotationsForRecord.entrySet()) { + annotationsForRecordWithRenamedKeys.put(annotation.getKey() + "_" + i, annotation.getValue()); + } + + annotationsForRecord = annotationsForRecordWithRenamedKeys; + } + + //Add the annotations from this record to each output line. + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + infoAnnotationOutput.putAll(annotationsForRecord); + } + } + } + + /** + * Implements "explode" mode. Takes the current list of + * infoAnnotationOutputs (each element of will end up in a different line + * of the output VCF file), and generates/returns a new list of infoAnnotationOutputs + * which contain one copy of the current infoAnnotationOutputs for each record + * in matching records. The returned list will have size: + * + * infoAnnotationOutputsList.size() * matchingRecords.size() + * + * See class-level comments for more details. + * + * @param infoAnnotationOutputsList + * @param matchingRecords + * @param bindingName + * @return + */ + private List> explodeInfoAnnotationOutputsList( + final List> infoAnnotationOutputsList, + final List> matchingRecords, + final String bindingName) { + + + //This is the return value. It represents the new list of lines in the output VCF file. + final List> newInfoAnnotationOutputsList = new LinkedList>(); + + //For each matching record, generate a new output line + for(int i = 0; i < matchingRecords.size(); i++) { + Map annotationsForRecord = matchingRecords.get(i); + annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested. + + //Add the annotations from this record to each output line. + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + Map infoAnnotationOutputCopy = new HashMap(infoAnnotationOutput); //create a new copy of this line. + infoAnnotationOutputCopy.putAll(annotationsForRecord); //Adds the column-value pairs from this record to this line. + + newInfoAnnotationOutputsList.add(infoAnnotationOutputCopy); //Add the line to the new list of lines. + } + } + + return newInfoAnnotationOutputsList; + } + + + + /** + * Takes a list of key-value pairs and returns a new Map containing only the columns which were requested by the user + * via the -s arg. If there was no -s arg that referenced the given bindingName, all annotationsForRecord returned untouched. + * + * @param bindingName The binding name for a particular ROD input file. + * @param annotationsForRecord The list of column_name -> value pairs for a particular record from the given input file. + * + * @return Map - see above. + */ + private Map selectColumnsFromRecord( String bindingName, Map annotationsForRecord) { + if(requestedColumnsMap == null || !requestedColumnsMap.containsKey(bindingName)) { + return annotationsForRecord; + } + + Set requestedColumns = requestedColumnsMap.get(bindingName); + Map subsettedAnnotations = new HashMap(); + for(Map.Entry e : annotationsForRecord.entrySet() ) { + if(requestedColumns.contains(e.getKey())) { + subsettedAnnotations.put(e.getKey(), e.getValue()); + } + } + + if(subsettedAnnotations.isEmpty()) { + throw new StingException("Invalid -s argument for the '" + bindingName + "' input file. " + + "It caused all columns in the file to be rejected. Please check to make sure the -s column " + + "names match the column names in the '" + bindingName + "' file's HEADER line."); + } + + return subsettedAnnotations; + } + + + + /** + * Determines how the engine will handle the case where multiple records in a ROD file + * overlap a particular single locus. If explode is set to true, the output will be + * one-to-many, so that each locus in the input VCF file could result in multiple + * entries in the output VCF file. Otherwise, the output will be one-to-one, and + * all multiple-match records will be collapsed into the single info field. + * The collapsing is done by appending an _i to each key name (where 'i' is a + * record counter). + * + * See class-level comments for more details. + * + * @param explode + */ + public void setExplode(boolean explode) { + this.explode = explode; + } + + /** + * Sets the columns that will be used for the info annotation field. + * Column names should be of the form bindingName.columnName (eg. dbsnp.avHet). + * + * @param columns An array of strings where each string is a comma-separated list + * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ). + */ + public void setRequestedColumns(String[] columns) { + if(columns == null) { + throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg."); + } + + //System.err.println("COLUMNS: "+Arrays.asList(columns).toString()); + + this.requestedColumnsMap = GenomicAnnotation.parseColumnsArg(columns); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 1e5cc7f7b..b475608be 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; @@ -224,7 +225,8 @@ public class UnifiedGenotyperEngine { // first off, we want to use the *unfiltered* context for the annotations stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup()); - call.vc = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc); + Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc); + call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element. } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java new file mode 100644 index 000000000..f716c8a4d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java @@ -0,0 +1,163 @@ +package org.broadinstitute.sting.playground.gatk.walkers.annotator; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Map.Entry; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.TabularROD; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; + +/** + * TODO comment + */ +public class GenomicAnnotation implements InfoFieldAnnotation { + + + /** + * For each ROD (aka. record) which overlaps the current locus, generates a + * set of annotations of the form: + * + * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A), + * + * These annotations are stored in a Map. + * + * Since a single input file can have multiple records that overlap the current + * locus (eg. dbSNP can have multiple entries for the same location), a different + * Map is created for each of these, resulting in a List> + * for each input file. + * + * The return value of this method is a Map of the form: + * rodName1 -> List> + * rodName2 -> List> + * rodName3 -> List> + * ... + * + * The List values are guaranteed to have size > 0, and in most cases will have size == 1. + */ + public Map annotate(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + + final Map annotations = new HashMap(); + for(final GATKFeature gatkFeature : tracker.getAllRods()) + { + final ReferenceOrderedDatum rod = (ReferenceOrderedDatum) gatkFeature.getUnderlyingObject(); + if(! (rod instanceof TabularROD) ) { + continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns + } + + final String name = rod.getName(); + if(name.equals("variant") || name.equals("interval")) { + continue; + } + + + final Map annotationsForRecord = convertRecordToAnnotations( (TabularROD) rod ); + + List> listOfMatchingRecords = (List>) annotations.get(name); + if(listOfMatchingRecords == null) { + listOfMatchingRecords = new LinkedList>(); + listOfMatchingRecords.add( annotationsForRecord ); + annotations.put(name, listOfMatchingRecords); + } else { + listOfMatchingRecords.add( annotationsForRecord ); + } + } + + return annotations; + } + + + + + /** + * Converts the ROD to a set of key-value pairs of the form: + * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue + * (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A) + * + * @param rod A TabularROD corresponding to one record in one input file. + * + * @return The map of column-name -> value pairs. + */ + private Map convertRecordToAnnotations( final TabularROD rod ) { + final String rodName = rod.getName(); //aka the rod binding + + final Map result = new HashMap(); + for(final Entry entry : rod.entrySet()) { + result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue()); + } + + return result; + } + + public static String generateInfoFieldKey(String rodBindingName, String columnName ) { + return rodBindingName + "." + columnName; + } + + + /** + * Parses the columns arg and returns a Map of columns hashed by their binding name. + * For example: + * The command line: + * -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd + * + * will be passed to this method as: + * ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"] + * + * resulting in a return value of: + * { + * "dbSnp" -> "dbSnp.valid" , + * "dbSnp" -> "dbsnp.avHet" , + * "refGene" -> "refGene.txStart", + * "refGene" -> "refGene.txEnd" + * } + * + * @param columnsArg The -s command line arg value. + * + * @return Map representing a parsed version of this arg - see above. + */ + public static Map> parseColumnsArg(String[] columnsArg) { + Map> result = new HashMap>(); + + for(String s : columnsArg) { + for(String columnSpecifier : s.split(",") ) { + String[] rodNameColumnName = columnSpecifier.split("\\."); + if(rodNameColumnName.length != 2) { + throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'."); + } + String rodName = rodNameColumnName[0]; + //String columnName = rodNameColumnName[1]; + + Set requestedColumns = result.get(rodName); + if(requestedColumns == null) { + requestedColumns = new HashSet(); + result.put(rodName, requestedColumns); + } + requestedColumns.add(columnSpecifier); + } + } + + return result; + } + + public VCFInfoHeaderLine getDescription() { + return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. "); + } + + public String getKeyName() { + return "GenericAnnotation"; + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java new file mode 100644 index 000000000..00326d8a8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java @@ -0,0 +1,174 @@ + package org.broadinstitute.sting.playground.gatk.walkers.annotator; + +import java.io.File; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.walkers.Allows; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; + + +/** + * Annotates variant calls with context information. Users can specify which of the available annotations to use. + */ +//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class)) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) +@Reference(window=@Window(start=-50,stop=50)) +@By(DataSource.REFERENCE) +public class GenomicAnnotator extends RodWalker { + @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) + protected File VCF_OUT; + @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) + protected String sampleName = null; + + @Argument(fullName="select", shortName="s", doc="Select which columns to use for each ROD file. Column #s are 0-based. (eg. The following will select columns 5,6,2 from file1.txt and columns 3,7 from file2.txt: -B my-rod,table,/path/file1.txt -B my-rod2,table,/path/file2.txt -S my-rod={5,6,2} -S my-rod2={3,7})", required=false) + protected String[] COLUMNS = {}; + + @Argument(fullName="explode", shortName="exp", doc="If more than one record from the same file matches a particular locus, create multiple entries in the ouptut file - one for each match. WARNING: This could lead to combinatorial explotion if more than one file have more than one match at a particular locus.", required=false) + protected Boolean EXPLODE = false; + + private VCFWriter vcfWriter; + + private HashMap nonVCFsampleName = new HashMap(); + + private VariantAnnotatorEngine engine; + + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + + // get the list of all sample names from the various VCF input rods + TreeSet samples = new TreeSet(); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + + // add the non-VCF sample from the command-line, if applicable + if ( sampleName != null ) { + nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); + samples.add(sampleName.toUpperCase()); + } + + // if there are no valid samples, warn the user + if ( samples.size() == 0 ) { + logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); + } + + engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" }); + + engine.setExplode( Boolean.TRUE.equals( EXPLODE ) ); + engine.setRequestedColumns(COLUMNS); + + // setup the header fields + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "Annotator")); + hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + hInfo.addAll(engine.getVCFAnnotationDescriptions()); + + vcfWriter = new VCFWriter(VCF_OUT); + VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + + /** + * We want reads that span deletions + * + * @return true + */ + public boolean includeReadsWithDeletionAtLoci() { return true; } + + /** + * For each site of interest, annotate based on the requested annotation types + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + + List rods = tracker.getReferenceMetaData("variant"); + // ignore places where we don't have a variant + if ( rods.size() == 0 ) + return 0; + + Object variant = rods.get(0); + VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant); + if ( vc == null ) + return 0; + + // if the reference base is not ambiguous, we can annotate + Collection annotatedVCs = null; + if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { + Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); + if ( stratifiedContexts != null ) { + annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc); + } + } + + for(VariantContext annotatedVC : annotatedVCs) { + vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase())); + } + + return 1; + } + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); + + vcfWriter.close(); + } +} +