First checkin of GenomicAnnotator which annotates an input VCF file by pulling data in a generic way from an arbitrary set of TabularRODs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3114 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
699a0ea9d1
commit
6b7b07f178
|
|
@ -123,7 +123,7 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
|
||||||
*/
|
*/
|
||||||
public Integer reduceInit() { return 0; }
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* We want reads that span deletions
|
* We want reads that span deletions
|
||||||
*
|
*
|
||||||
|
|
@ -154,16 +154,20 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// if the reference base is not ambiguous, we can annotate
|
// if the reference base is not ambiguous, we can annotate
|
||||||
|
Collection<VariantContext> annotatedVCs = Arrays.asList( new VariantContext[] { vc } );
|
||||||
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
||||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
||||||
if ( stratifiedContexts != null ) {
|
if ( stratifiedContexts != null ) {
|
||||||
vc = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( variant instanceof RodVCF ) {
|
||||||
|
for(VariantContext annotatedVC : annotatedVCs ) {
|
||||||
|
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( variant instanceof RodVCF )
|
|
||||||
vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||||
|
import org.broadinstitute.sting.playground.gatk.walkers.annotator.GenomicAnnotation;
|
||||||
import org.broadinstitute.sting.utils.PackageUtils;
|
import org.broadinstitute.sting.utils.PackageUtils;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||||
|
|
@ -33,6 +34,12 @@ public class VariantAnnotatorEngine {
|
||||||
// how about hapmap3?
|
// how about hapmap3?
|
||||||
private boolean annotateHapmap3 = false;
|
private boolean annotateHapmap3 = false;
|
||||||
|
|
||||||
|
// command-line option used for GenomicAnnotation.
|
||||||
|
private Map<String, Set<String>> requestedColumnsMap;
|
||||||
|
|
||||||
|
// command-line option used for GenomicAnnotation.
|
||||||
|
private boolean explode;
|
||||||
|
|
||||||
|
|
||||||
// use this constructor if you want all possible annotations
|
// use this constructor if you want all possible annotations
|
||||||
public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
|
public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
|
||||||
|
|
@ -143,7 +150,7 @@ public class VariantAnnotatorEngine {
|
||||||
return descriptions;
|
return descriptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Collection<VariantContext> annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
||||||
Map<String, Object> infoAnnotations = new HashMap<String, Object>(vc.getAttributes());
|
Map<String, Object> infoAnnotations = new HashMap<String, Object>(vc.getAttributes());
|
||||||
|
|
||||||
|
|
@ -166,12 +173,50 @@ public class VariantAnnotatorEngine {
|
||||||
infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1");
|
infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1");
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
|
|
||||||
Map<String, Object> result = annotation.annotate(tracker, ref, stratifiedContexts, vc);
|
//Process the info field
|
||||||
if ( result != null )
|
List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
|
||||||
infoAnnotations.putAll(result);
|
infoAnnotationOutputsList.add(new HashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file gains 1 line.
|
||||||
|
|
||||||
|
//go through all the requested info annotationTypes
|
||||||
|
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
|
||||||
|
{
|
||||||
|
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
|
||||||
|
if ( annotationsFromCurrentType == null ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(annotationType instanceof GenomicAnnotation)
|
||||||
|
{
|
||||||
|
//go through the annotations returned by GenericAnnotation for each -B input file.
|
||||||
|
for( Map.Entry<String, Object> annotationsFromInputFile : annotationsFromCurrentType.entrySet() )
|
||||||
|
{
|
||||||
|
final String inputFileBindingName = annotationsFromInputFile.getKey();
|
||||||
|
final List<Map<String, String>> matchingRecords = (List<Map<String, String>>) annotationsFromInputFile.getValue();
|
||||||
|
|
||||||
|
if( matchingRecords.size() > 1 && explode)
|
||||||
|
{
|
||||||
|
//More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size().
|
||||||
|
infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will
|
||||||
|
//all be added to the same output line, with keys disambiguated by appending _i .
|
||||||
|
addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//add the annotations to each output line.
|
||||||
|
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||||
|
infoAnnotationOutput.putAll(annotationsFromCurrentType);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Process genotypes
|
||||||
Map<String, Genotype> genotypes;
|
Map<String, Genotype> genotypes;
|
||||||
if ( requestedGenotypeAnnotations.size() == 0 ) {
|
if ( requestedGenotypeAnnotations.size() == 0 ) {
|
||||||
genotypes = vc.getGenotypes();
|
genotypes = vc.getGenotypes();
|
||||||
|
|
@ -195,6 +240,161 @@ public class VariantAnnotatorEngine {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotations);
|
//Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
|
||||||
|
Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
|
||||||
|
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||||
|
returnValue.add( new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotationOutput) );
|
||||||
|
}
|
||||||
|
|
||||||
|
return returnValue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implements non-explode mode, where the output lines have a one-to-one relationship
|
||||||
|
* with the input variants, and all multiple-match records are collapsed into the single info field.
|
||||||
|
* The collapsing is done by appending an _i to each key name (where 'i' is a record counter).
|
||||||
|
*
|
||||||
|
* @param infoAnnotationOutputsList
|
||||||
|
* @param matchingRecords
|
||||||
|
* @param bindingName
|
||||||
|
*/
|
||||||
|
private void addToExistingAnnotationOutputs(
|
||||||
|
final List<Map<String, Object>> infoAnnotationOutputsList,
|
||||||
|
final List<Map<String, String>> matchingRecords,
|
||||||
|
final String bindingName) {
|
||||||
|
//For each matching record, just add its annotations to all existing output lines.
|
||||||
|
final boolean renameKeys = matchingRecords.size() > 1;
|
||||||
|
for(int i = 0; i < matchingRecords.size(); i++) {
|
||||||
|
Map<String,String> annotationsForRecord = matchingRecords.get(i);
|
||||||
|
annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
|
||||||
|
|
||||||
|
if(renameKeys) {
|
||||||
|
//Rename keys to avoid naming conflicts (eg. if you have multiple dbsnp matches,
|
||||||
|
// dbSNP.avHet=value1 from record 1 and dbSNP.avHet=value2 from record 2 will become dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 )
|
||||||
|
Map<String,String> annotationsForRecordWithRenamedKeys = new HashMap<String, String>();
|
||||||
|
for(Map.Entry<String, String> annotation : annotationsForRecord.entrySet()) {
|
||||||
|
annotationsForRecordWithRenamedKeys.put(annotation.getKey() + "_" + i, annotation.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
annotationsForRecord = annotationsForRecordWithRenamedKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Add the annotations from this record to each output line.
|
||||||
|
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||||
|
infoAnnotationOutput.putAll(annotationsForRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implements "explode" mode. Takes the current list of
|
||||||
|
* infoAnnotationOutputs (each element of will end up in a different line
|
||||||
|
* of the output VCF file), and generates/returns a new list of infoAnnotationOutputs
|
||||||
|
* which contain one copy of the current infoAnnotationOutputs for each record
|
||||||
|
* in matching records. The returned list will have size:
|
||||||
|
*
|
||||||
|
* infoAnnotationOutputsList.size() * matchingRecords.size()
|
||||||
|
*
|
||||||
|
* See class-level comments for more details.
|
||||||
|
*
|
||||||
|
* @param infoAnnotationOutputsList
|
||||||
|
* @param matchingRecords
|
||||||
|
* @param bindingName
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private List<Map<String, Object>> explodeInfoAnnotationOutputsList(
|
||||||
|
final List<Map<String, Object>> infoAnnotationOutputsList,
|
||||||
|
final List<Map<String, String>> matchingRecords,
|
||||||
|
final String bindingName) {
|
||||||
|
|
||||||
|
|
||||||
|
//This is the return value. It represents the new list of lines in the output VCF file.
|
||||||
|
final List<Map<String, Object>> newInfoAnnotationOutputsList = new LinkedList<Map<String, Object>>();
|
||||||
|
|
||||||
|
//For each matching record, generate a new output line
|
||||||
|
for(int i = 0; i < matchingRecords.size(); i++) {
|
||||||
|
Map<String,String> annotationsForRecord = matchingRecords.get(i);
|
||||||
|
annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
|
||||||
|
|
||||||
|
//Add the annotations from this record to each output line.
|
||||||
|
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||||
|
Map<String, Object> infoAnnotationOutputCopy = new HashMap<String, Object>(infoAnnotationOutput); //create a new copy of this line.
|
||||||
|
infoAnnotationOutputCopy.putAll(annotationsForRecord); //Adds the column-value pairs from this record to this line.
|
||||||
|
|
||||||
|
newInfoAnnotationOutputsList.add(infoAnnotationOutputCopy); //Add the line to the new list of lines.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newInfoAnnotationOutputsList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a list of key-value pairs and returns a new Map containing only the columns which were requested by the user
|
||||||
|
* via the -s arg. If there was no -s arg that referenced the given bindingName, all annotationsForRecord returned untouched.
|
||||||
|
*
|
||||||
|
* @param bindingName The binding name for a particular ROD input file.
|
||||||
|
* @param annotationsForRecord The list of column_name -> value pairs for a particular record from the given input file.
|
||||||
|
*
|
||||||
|
* @return Map - see above.
|
||||||
|
*/
|
||||||
|
private Map<String, String> selectColumnsFromRecord( String bindingName, Map<String, String> annotationsForRecord) {
|
||||||
|
if(requestedColumnsMap == null || !requestedColumnsMap.containsKey(bindingName)) {
|
||||||
|
return annotationsForRecord;
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> requestedColumns = requestedColumnsMap.get(bindingName);
|
||||||
|
Map<String, String> subsettedAnnotations = new HashMap<String, String>();
|
||||||
|
for(Map.Entry<String, String> e : annotationsForRecord.entrySet() ) {
|
||||||
|
if(requestedColumns.contains(e.getKey())) {
|
||||||
|
subsettedAnnotations.put(e.getKey(), e.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(subsettedAnnotations.isEmpty()) {
|
||||||
|
throw new StingException("Invalid -s argument for the '" + bindingName + "' input file. " +
|
||||||
|
"It caused all columns in the file to be rejected. Please check to make sure the -s column " +
|
||||||
|
"names match the column names in the '" + bindingName + "' file's HEADER line.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return subsettedAnnotations;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines how the engine will handle the case where multiple records in a ROD file
|
||||||
|
* overlap a particular single locus. If explode is set to true, the output will be
|
||||||
|
* one-to-many, so that each locus in the input VCF file could result in multiple
|
||||||
|
* entries in the output VCF file. Otherwise, the output will be one-to-one, and
|
||||||
|
* all multiple-match records will be collapsed into the single info field.
|
||||||
|
* The collapsing is done by appending an _i to each key name (where 'i' is a
|
||||||
|
* record counter).
|
||||||
|
*
|
||||||
|
* See class-level comments for more details.
|
||||||
|
*
|
||||||
|
* @param explode
|
||||||
|
*/
|
||||||
|
public void setExplode(boolean explode) {
|
||||||
|
this.explode = explode;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the columns that will be used for the info annotation field.
|
||||||
|
* Column names should be of the form bindingName.columnName (eg. dbsnp.avHet).
|
||||||
|
*
|
||||||
|
* @param columns An array of strings where each string is a comma-separated list
|
||||||
|
* of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ).
|
||||||
|
*/
|
||||||
|
public void setRequestedColumns(String[] columns) {
|
||||||
|
if(columns == null) {
|
||||||
|
throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg.");
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.err.println("COLUMNS: "+Arrays.asList(columns).toString());
|
||||||
|
|
||||||
|
this.requestedColumnsMap = GenomicAnnotation.parseColumnsArg(columns);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
|
|
@ -224,7 +225,8 @@ public class UnifiedGenotyperEngine {
|
||||||
// first off, we want to use the *unfiltered* context for the annotations
|
// first off, we want to use the *unfiltered* context for the annotations
|
||||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
|
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
|
||||||
|
|
||||||
call.vc = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
|
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
|
||||||
|
call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,163 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* TODO comment
|
||||||
|
*/
|
||||||
|
public class GenomicAnnotation implements InfoFieldAnnotation {
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each ROD (aka. record) which overlaps the current locus, generates a
|
||||||
|
* set of annotations of the form:
|
||||||
|
*
|
||||||
|
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
|
||||||
|
*
|
||||||
|
* These annotations are stored in a Map<String, String>.
|
||||||
|
*
|
||||||
|
* Since a single input file can have multiple records that overlap the current
|
||||||
|
* locus (eg. dbSNP can have multiple entries for the same location), a different
|
||||||
|
* Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
|
||||||
|
* for each input file.
|
||||||
|
*
|
||||||
|
* The return value of this method is a Map of the form:
|
||||||
|
* rodName1 -> List<Map<String, String>>
|
||||||
|
* rodName2 -> List<Map<String, String>>
|
||||||
|
* rodName3 -> List<Map<String, String>>
|
||||||
|
* ...
|
||||||
|
*
|
||||||
|
* The List values are guaranteed to have size > 0, and in most cases will have size == 1.
|
||||||
|
*/
|
||||||
|
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||||
|
final ReferenceContext ref,
|
||||||
|
final Map<String, StratifiedAlignmentContext> stratifiedContexts,
|
||||||
|
final VariantContext vc) {
|
||||||
|
|
||||||
|
final Map<String, Object> annotations = new HashMap<String, Object>();
|
||||||
|
for(final GATKFeature gatkFeature : tracker.getAllRods())
|
||||||
|
{
|
||||||
|
final ReferenceOrderedDatum rod = (ReferenceOrderedDatum) gatkFeature.getUnderlyingObject();
|
||||||
|
if(! (rod instanceof TabularROD) ) {
|
||||||
|
continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns
|
||||||
|
}
|
||||||
|
|
||||||
|
final String name = rod.getName();
|
||||||
|
if(name.equals("variant") || name.equals("interval")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final Map<String, String> annotationsForRecord = convertRecordToAnnotations( (TabularROD) rod );
|
||||||
|
|
||||||
|
List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get(name);
|
||||||
|
if(listOfMatchingRecords == null) {
|
||||||
|
listOfMatchingRecords = new LinkedList<Map<String,String>>();
|
||||||
|
listOfMatchingRecords.add( annotationsForRecord );
|
||||||
|
annotations.put(name, listOfMatchingRecords);
|
||||||
|
} else {
|
||||||
|
listOfMatchingRecords.add( annotationsForRecord );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return annotations;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts the ROD to a set of key-value pairs of the form:
|
||||||
|
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
|
||||||
|
* (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
|
||||||
|
*
|
||||||
|
* @param rod A TabularROD corresponding to one record in one input file.
|
||||||
|
*
|
||||||
|
* @return The map of column-name -> value pairs.
|
||||||
|
*/
|
||||||
|
private Map<String, String> convertRecordToAnnotations( final TabularROD rod ) {
|
||||||
|
final String rodName = rod.getName(); //aka the rod binding
|
||||||
|
|
||||||
|
final Map<String, String> result = new HashMap<String, String>();
|
||||||
|
for(final Entry<String, String> entry : rod.entrySet()) {
|
||||||
|
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
|
||||||
|
return rodBindingName + "." + columnName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the columns arg and returns a Map of columns hashed by their binding name.
|
||||||
|
* For example:
|
||||||
|
* The command line:
|
||||||
|
* -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd
|
||||||
|
*
|
||||||
|
* will be passed to this method as:
|
||||||
|
* ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"]
|
||||||
|
*
|
||||||
|
* resulting in a return value of:
|
||||||
|
* {
|
||||||
|
* "dbSnp" -> "dbSnp.valid" ,
|
||||||
|
* "dbSnp" -> "dbsnp.avHet" ,
|
||||||
|
* "refGene" -> "refGene.txStart",
|
||||||
|
* "refGene" -> "refGene.txEnd"
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* @param columnsArg The -s command line arg value.
|
||||||
|
*
|
||||||
|
* @return Map representing a parsed version of this arg - see above.
|
||||||
|
*/
|
||||||
|
public static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
|
||||||
|
Map<String, Set<String>> result = new HashMap<String, Set<String>>();
|
||||||
|
|
||||||
|
for(String s : columnsArg) {
|
||||||
|
for(String columnSpecifier : s.split(",") ) {
|
||||||
|
String[] rodNameColumnName = columnSpecifier.split("\\.");
|
||||||
|
if(rodNameColumnName.length != 2) {
|
||||||
|
throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'.");
|
||||||
|
}
|
||||||
|
String rodName = rodNameColumnName[0];
|
||||||
|
//String columnName = rodNameColumnName[1];
|
||||||
|
|
||||||
|
Set<String> requestedColumns = result.get(rodName);
|
||||||
|
if(requestedColumns == null) {
|
||||||
|
requestedColumns = new HashSet<String>();
|
||||||
|
result.put(rodName, requestedColumns);
|
||||||
|
}
|
||||||
|
requestedColumns.add(columnSpecifier);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFInfoHeaderLine getDescription() {
|
||||||
|
return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. ");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getKeyName() {
|
||||||
|
return "GenericAnnotation";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,174 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Allows;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.By;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Annotates variant calls with context information. Users can specify which of the available annotations to use.
|
||||||
|
*/
|
||||||
|
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
|
||||||
|
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||||
|
@Reference(window=@Window(start=-50,stop=50))
|
||||||
|
@By(DataSource.REFERENCE)
|
||||||
|
public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
|
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
|
||||||
|
protected File VCF_OUT;
|
||||||
|
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
|
||||||
|
protected String sampleName = null;
|
||||||
|
|
||||||
|
@Argument(fullName="select", shortName="s", doc="Select which columns to use for each ROD file. Column #s are 0-based. (eg. The following will select columns 5,6,2 from file1.txt and columns 3,7 from file2.txt: -B my-rod,table,/path/file1.txt -B my-rod2,table,/path/file2.txt -S my-rod={5,6,2} -S my-rod2={3,7})", required=false)
|
||||||
|
protected String[] COLUMNS = {};
|
||||||
|
|
||||||
|
@Argument(fullName="explode", shortName="exp", doc="If more than one record from the same file matches a particular locus, create multiple entries in the ouptut file - one for each match. WARNING: This could lead to combinatorial explotion if more than one file have more than one match at a particular locus.", required=false)
|
||||||
|
protected Boolean EXPLODE = false;
|
||||||
|
|
||||||
|
private VCFWriter vcfWriter;
|
||||||
|
|
||||||
|
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
|
||||||
|
|
||||||
|
private VariantAnnotatorEngine engine;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare the output file and the list of available features.
|
||||||
|
*/
|
||||||
|
public void initialize() {
|
||||||
|
|
||||||
|
// get the list of all sample names from the various VCF input rods
|
||||||
|
TreeSet<String> samples = new TreeSet<String>();
|
||||||
|
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||||
|
|
||||||
|
// add the non-VCF sample from the command-line, if applicable
|
||||||
|
if ( sampleName != null ) {
|
||||||
|
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
|
||||||
|
samples.add(sampleName.toUpperCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
// if there are no valid samples, warn the user
|
||||||
|
if ( samples.size() == 0 ) {
|
||||||
|
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
|
||||||
|
}
|
||||||
|
|
||||||
|
engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
|
||||||
|
|
||||||
|
engine.setExplode( Boolean.TRUE.equals( EXPLODE ) );
|
||||||
|
engine.setRequestedColumns(COLUMNS);
|
||||||
|
|
||||||
|
// setup the header fields
|
||||||
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||||
|
hInfo.add(new VCFHeaderLine("source", "Annotator"));
|
||||||
|
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
|
||||||
|
hInfo.addAll(engine.getVCFAnnotationDescriptions());
|
||||||
|
|
||||||
|
vcfWriter = new VCFWriter(VCF_OUT);
|
||||||
|
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||||
|
vcfWriter.writeHeader(vcfHeader);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the number of loci processed to zero.
|
||||||
|
*
|
||||||
|
* @return 0
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We want reads that span deletions
|
||||||
|
*
|
||||||
|
* @return true
|
||||||
|
*/
|
||||||
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each site of interest, annotate based on the requested annotation types
|
||||||
|
*
|
||||||
|
* @param tracker the meta-data tracker
|
||||||
|
* @param ref the reference base
|
||||||
|
* @param context the context for the given locus
|
||||||
|
* @return 1 if the locus was successfully processed, 0 if otherwise
|
||||||
|
*/
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
|
List<Object> rods = tracker.getReferenceMetaData("variant");
|
||||||
|
// ignore places where we don't have a variant
|
||||||
|
if ( rods.size() == 0 )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
Object variant = rods.get(0);
|
||||||
|
VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant);
|
||||||
|
if ( vc == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// if the reference base is not ambiguous, we can annotate
|
||||||
|
Collection<VariantContext> annotatedVCs = null;
|
||||||
|
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
||||||
|
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
||||||
|
if ( stratifiedContexts != null ) {
|
||||||
|
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(VariantContext annotatedVC : annotatedVCs) {
|
||||||
|
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increment the number of loci processed.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return the new number of loci processed.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return sum + value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tell the user the number of loci processed and close out the new variants file.
|
||||||
|
*
|
||||||
|
* @param result the number of loci seen.
|
||||||
|
*/
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
out.printf("Processed %d loci.\n", result);
|
||||||
|
|
||||||
|
vcfWriter.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue