First checkin of GenomicAnnotator which annotates an input VCF file by pulling data in a generic way from an arbitrary set of TabularRODs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3114 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
699a0ea9d1
commit
6b7b07f178
|
|
@ -123,7 +123,7 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
|
|||
*/
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* We want reads that span deletions
|
||||
*
|
||||
|
|
@ -154,16 +154,20 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
|
|||
return 0;
|
||||
|
||||
// if the reference base is not ambiguous, we can annotate
|
||||
Collection<VariantContext> annotatedVCs = Arrays.asList( new VariantContext[] { vc } );
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
||||
if ( stratifiedContexts != null ) {
|
||||
vc = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
||||
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
||||
}
|
||||
}
|
||||
|
||||
if ( variant instanceof RodVCF ) {
|
||||
for(VariantContext annotatedVC : annotatedVCs ) {
|
||||
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
||||
}
|
||||
}
|
||||
|
||||
if ( variant instanceof RodVCF )
|
||||
vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.annotator.GenomicAnnotation;
|
||||
import org.broadinstitute.sting.utils.PackageUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||
|
|
@ -33,6 +34,12 @@ public class VariantAnnotatorEngine {
|
|||
// how about hapmap3?
|
||||
private boolean annotateHapmap3 = false;
|
||||
|
||||
// command-line option used for GenomicAnnotation.
|
||||
private Map<String, Set<String>> requestedColumnsMap;
|
||||
|
||||
// command-line option used for GenomicAnnotation.
|
||||
private boolean explode;
|
||||
|
||||
|
||||
// use this constructor if you want all possible annotations
|
||||
public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
|
||||
|
|
@ -143,7 +150,7 @@ public class VariantAnnotatorEngine {
|
|||
return descriptions;
|
||||
}
|
||||
|
||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
public Collection<VariantContext> annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
Map<String, Object> infoAnnotations = new HashMap<String, Object>(vc.getAttributes());
|
||||
|
||||
|
|
@ -166,12 +173,50 @@ public class VariantAnnotatorEngine {
|
|||
infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1");
|
||||
}
|
||||
|
||||
for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
|
||||
Map<String, Object> result = annotation.annotate(tracker, ref, stratifiedContexts, vc);
|
||||
if ( result != null )
|
||||
infoAnnotations.putAll(result);
|
||||
|
||||
//Process the info field
|
||||
List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
|
||||
infoAnnotationOutputsList.add(new HashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file gains 1 line.
|
||||
|
||||
//go through all the requested info annotationTypes
|
||||
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
|
||||
{
|
||||
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
|
||||
if ( annotationsFromCurrentType == null ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(annotationType instanceof GenomicAnnotation)
|
||||
{
|
||||
//go through the annotations returned by GenericAnnotation for each -B input file.
|
||||
for( Map.Entry<String, Object> annotationsFromInputFile : annotationsFromCurrentType.entrySet() )
|
||||
{
|
||||
final String inputFileBindingName = annotationsFromInputFile.getKey();
|
||||
final List<Map<String, String>> matchingRecords = (List<Map<String, String>>) annotationsFromInputFile.getValue();
|
||||
|
||||
if( matchingRecords.size() > 1 && explode)
|
||||
{
|
||||
//More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size().
|
||||
infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
|
||||
}
|
||||
else
|
||||
{
|
||||
//This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will
|
||||
//all be added to the same output line, with keys disambiguated by appending _i .
|
||||
addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//add the annotations to each output line.
|
||||
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||
infoAnnotationOutput.putAll(annotationsFromCurrentType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Process genotypes
|
||||
Map<String, Genotype> genotypes;
|
||||
if ( requestedGenotypeAnnotations.size() == 0 ) {
|
||||
genotypes = vc.getGenotypes();
|
||||
|
|
@ -195,6 +240,161 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
}
|
||||
|
||||
return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotations);
|
||||
//Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
|
||||
Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
|
||||
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||
returnValue.add( new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotationOutput) );
|
||||
}
|
||||
|
||||
return returnValue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implements non-explode mode, where the output lines have a one-to-one relationship
|
||||
* with the input variants, and all multiple-match records are collapsed into the single info field.
|
||||
* The collapsing is done by appending an _i to each key name (where 'i' is a record counter).
|
||||
*
|
||||
* @param infoAnnotationOutputsList
|
||||
* @param matchingRecords
|
||||
* @param bindingName
|
||||
*/
|
||||
private void addToExistingAnnotationOutputs(
|
||||
final List<Map<String, Object>> infoAnnotationOutputsList,
|
||||
final List<Map<String, String>> matchingRecords,
|
||||
final String bindingName) {
|
||||
//For each matching record, just add its annotations to all existing output lines.
|
||||
final boolean renameKeys = matchingRecords.size() > 1;
|
||||
for(int i = 0; i < matchingRecords.size(); i++) {
|
||||
Map<String,String> annotationsForRecord = matchingRecords.get(i);
|
||||
annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
|
||||
|
||||
if(renameKeys) {
|
||||
//Rename keys to avoid naming conflicts (eg. if you have multiple dbsnp matches,
|
||||
// dbSNP.avHet=value1 from record 1 and dbSNP.avHet=value2 from record 2 will become dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 )
|
||||
Map<String,String> annotationsForRecordWithRenamedKeys = new HashMap<String, String>();
|
||||
for(Map.Entry<String, String> annotation : annotationsForRecord.entrySet()) {
|
||||
annotationsForRecordWithRenamedKeys.put(annotation.getKey() + "_" + i, annotation.getValue());
|
||||
}
|
||||
|
||||
annotationsForRecord = annotationsForRecordWithRenamedKeys;
|
||||
}
|
||||
|
||||
//Add the annotations from this record to each output line.
|
||||
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||
infoAnnotationOutput.putAll(annotationsForRecord);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements "explode" mode. Takes the current list of
|
||||
* infoAnnotationOutputs (each element of will end up in a different line
|
||||
* of the output VCF file), and generates/returns a new list of infoAnnotationOutputs
|
||||
* which contain one copy of the current infoAnnotationOutputs for each record
|
||||
* in matching records. The returned list will have size:
|
||||
*
|
||||
* infoAnnotationOutputsList.size() * matchingRecords.size()
|
||||
*
|
||||
* See class-level comments for more details.
|
||||
*
|
||||
* @param infoAnnotationOutputsList
|
||||
* @param matchingRecords
|
||||
* @param bindingName
|
||||
* @return
|
||||
*/
|
||||
private List<Map<String, Object>> explodeInfoAnnotationOutputsList(
|
||||
final List<Map<String, Object>> infoAnnotationOutputsList,
|
||||
final List<Map<String, String>> matchingRecords,
|
||||
final String bindingName) {
|
||||
|
||||
|
||||
//This is the return value. It represents the new list of lines in the output VCF file.
|
||||
final List<Map<String, Object>> newInfoAnnotationOutputsList = new LinkedList<Map<String, Object>>();
|
||||
|
||||
//For each matching record, generate a new output line
|
||||
for(int i = 0; i < matchingRecords.size(); i++) {
|
||||
Map<String,String> annotationsForRecord = matchingRecords.get(i);
|
||||
annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
|
||||
|
||||
//Add the annotations from this record to each output line.
|
||||
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
|
||||
Map<String, Object> infoAnnotationOutputCopy = new HashMap<String, Object>(infoAnnotationOutput); //create a new copy of this line.
|
||||
infoAnnotationOutputCopy.putAll(annotationsForRecord); //Adds the column-value pairs from this record to this line.
|
||||
|
||||
newInfoAnnotationOutputsList.add(infoAnnotationOutputCopy); //Add the line to the new list of lines.
|
||||
}
|
||||
}
|
||||
|
||||
return newInfoAnnotationOutputsList;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Takes a list of key-value pairs and returns a new Map containing only the columns which were requested by the user
|
||||
* via the -s arg. If there was no -s arg that referenced the given bindingName, all annotationsForRecord returned untouched.
|
||||
*
|
||||
* @param bindingName The binding name for a particular ROD input file.
|
||||
* @param annotationsForRecord The list of column_name -> value pairs for a particular record from the given input file.
|
||||
*
|
||||
* @return Map - see above.
|
||||
*/
|
||||
private Map<String, String> selectColumnsFromRecord( String bindingName, Map<String, String> annotationsForRecord) {
|
||||
if(requestedColumnsMap == null || !requestedColumnsMap.containsKey(bindingName)) {
|
||||
return annotationsForRecord;
|
||||
}
|
||||
|
||||
Set<String> requestedColumns = requestedColumnsMap.get(bindingName);
|
||||
Map<String, String> subsettedAnnotations = new HashMap<String, String>();
|
||||
for(Map.Entry<String, String> e : annotationsForRecord.entrySet() ) {
|
||||
if(requestedColumns.contains(e.getKey())) {
|
||||
subsettedAnnotations.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
if(subsettedAnnotations.isEmpty()) {
|
||||
throw new StingException("Invalid -s argument for the '" + bindingName + "' input file. " +
|
||||
"It caused all columns in the file to be rejected. Please check to make sure the -s column " +
|
||||
"names match the column names in the '" + bindingName + "' file's HEADER line.");
|
||||
}
|
||||
|
||||
return subsettedAnnotations;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Determines how the engine will handle the case where multiple records in a ROD file
|
||||
* overlap a particular single locus. If explode is set to true, the output will be
|
||||
* one-to-many, so that each locus in the input VCF file could result in multiple
|
||||
* entries in the output VCF file. Otherwise, the output will be one-to-one, and
|
||||
* all multiple-match records will be collapsed into the single info field.
|
||||
* The collapsing is done by appending an _i to each key name (where 'i' is a
|
||||
* record counter).
|
||||
*
|
||||
* See class-level comments for more details.
|
||||
*
|
||||
* @param explode
|
||||
*/
|
||||
public void setExplode(boolean explode) {
|
||||
this.explode = explode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the columns that will be used for the info annotation field.
|
||||
* Column names should be of the form bindingName.columnName (eg. dbsnp.avHet).
|
||||
*
|
||||
* @param columns An array of strings where each string is a comma-separated list
|
||||
* of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ).
|
||||
*/
|
||||
public void setRequestedColumns(String[] columns) {
|
||||
if(columns == null) {
|
||||
throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg.");
|
||||
}
|
||||
|
||||
//System.err.println("COLUMNS: "+Arrays.asList(columns).toString());
|
||||
|
||||
this.requestedColumnsMap = GenomicAnnotation.parseColumnsArg(columns);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
|
|
@ -224,7 +225,8 @@ public class UnifiedGenotyperEngine {
|
|||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
|
||||
|
||||
call.vc = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
|
||||
call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,163 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||
|
||||
/**
|
||||
* TODO comment
|
||||
*/
|
||||
public class GenomicAnnotation implements InfoFieldAnnotation {
|
||||
|
||||
|
||||
/**
|
||||
* For each ROD (aka. record) which overlaps the current locus, generates a
|
||||
* set of annotations of the form:
|
||||
*
|
||||
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
|
||||
*
|
||||
* These annotations are stored in a Map<String, String>.
|
||||
*
|
||||
* Since a single input file can have multiple records that overlap the current
|
||||
* locus (eg. dbSNP can have multiple entries for the same location), a different
|
||||
* Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
|
||||
* for each input file.
|
||||
*
|
||||
* The return value of this method is a Map of the form:
|
||||
* rodName1 -> List<Map<String, String>>
|
||||
* rodName2 -> List<Map<String, String>>
|
||||
* rodName3 -> List<Map<String, String>>
|
||||
* ...
|
||||
*
|
||||
* The List values are guaranteed to have size > 0, and in most cases will have size == 1.
|
||||
*/
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, StratifiedAlignmentContext> stratifiedContexts,
|
||||
final VariantContext vc) {
|
||||
|
||||
final Map<String, Object> annotations = new HashMap<String, Object>();
|
||||
for(final GATKFeature gatkFeature : tracker.getAllRods())
|
||||
{
|
||||
final ReferenceOrderedDatum rod = (ReferenceOrderedDatum) gatkFeature.getUnderlyingObject();
|
||||
if(! (rod instanceof TabularROD) ) {
|
||||
continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns
|
||||
}
|
||||
|
||||
final String name = rod.getName();
|
||||
if(name.equals("variant") || name.equals("interval")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
final Map<String, String> annotationsForRecord = convertRecordToAnnotations( (TabularROD) rod );
|
||||
|
||||
List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get(name);
|
||||
if(listOfMatchingRecords == null) {
|
||||
listOfMatchingRecords = new LinkedList<Map<String,String>>();
|
||||
listOfMatchingRecords.add( annotationsForRecord );
|
||||
annotations.put(name, listOfMatchingRecords);
|
||||
} else {
|
||||
listOfMatchingRecords.add( annotationsForRecord );
|
||||
}
|
||||
}
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Converts the ROD to a set of key-value pairs of the form:
|
||||
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
|
||||
* (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
|
||||
*
|
||||
* @param rod A TabularROD corresponding to one record in one input file.
|
||||
*
|
||||
* @return The map of column-name -> value pairs.
|
||||
*/
|
||||
private Map<String, String> convertRecordToAnnotations( final TabularROD rod ) {
|
||||
final String rodName = rod.getName(); //aka the rod binding
|
||||
|
||||
final Map<String, String> result = new HashMap<String, String>();
|
||||
for(final Entry<String, String> entry : rod.entrySet()) {
|
||||
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
|
||||
return rodBindingName + "." + columnName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parses the columns arg and returns a Map of columns hashed by their binding name.
|
||||
* For example:
|
||||
* The command line:
|
||||
* -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd
|
||||
*
|
||||
* will be passed to this method as:
|
||||
* ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"]
|
||||
*
|
||||
* resulting in a return value of:
|
||||
* {
|
||||
* "dbSnp" -> "dbSnp.valid" ,
|
||||
* "dbSnp" -> "dbsnp.avHet" ,
|
||||
* "refGene" -> "refGene.txStart",
|
||||
* "refGene" -> "refGene.txEnd"
|
||||
* }
|
||||
*
|
||||
* @param columnsArg The -s command line arg value.
|
||||
*
|
||||
* @return Map representing a parsed version of this arg - see above.
|
||||
*/
|
||||
public static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
|
||||
Map<String, Set<String>> result = new HashMap<String, Set<String>>();
|
||||
|
||||
for(String s : columnsArg) {
|
||||
for(String columnSpecifier : s.split(",") ) {
|
||||
String[] rodNameColumnName = columnSpecifier.split("\\.");
|
||||
if(rodNameColumnName.length != 2) {
|
||||
throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'.");
|
||||
}
|
||||
String rodName = rodNameColumnName[0];
|
||||
//String columnName = rodNameColumnName[1];
|
||||
|
||||
Set<String> requestedColumns = result.get(rodName);
|
||||
if(requestedColumns == null) {
|
||||
requestedColumns = new HashSet<String>();
|
||||
result.put(rodName, requestedColumns);
|
||||
}
|
||||
requestedColumns.add(columnSpecifier);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public VCFInfoHeaderLine getDescription() {
|
||||
return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. ");
|
||||
}
|
||||
|
||||
public String getKeyName() {
|
||||
return "GenericAnnotation";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.walkers.Allows;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
|
||||
|
||||
/**
|
||||
* Annotates variant calls with context information. Users can specify which of the available annotations to use.
|
||||
*/
|
||||
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
|
||||
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
@By(DataSource.REFERENCE)
|
||||
public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
|
||||
protected File VCF_OUT;
|
||||
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
|
||||
protected String sampleName = null;
|
||||
|
||||
@Argument(fullName="select", shortName="s", doc="Select which columns to use for each ROD file. Column #s are 0-based. (eg. The following will select columns 5,6,2 from file1.txt and columns 3,7 from file2.txt: -B my-rod,table,/path/file1.txt -B my-rod2,table,/path/file2.txt -S my-rod={5,6,2} -S my-rod2={3,7})", required=false)
|
||||
protected String[] COLUMNS = {};
|
||||
|
||||
@Argument(fullName="explode", shortName="exp", doc="If more than one record from the same file matches a particular locus, create multiple entries in the ouptut file - one for each match. WARNING: This could lead to combinatorial explotion if more than one file have more than one match at a particular locus.", required=false)
|
||||
protected Boolean EXPLODE = false;
|
||||
|
||||
private VCFWriter vcfWriter;
|
||||
|
||||
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
|
||||
|
||||
private VariantAnnotatorEngine engine;
|
||||
|
||||
|
||||
/**
|
||||
* Prepare the output file and the list of available features.
|
||||
*/
|
||||
public void initialize() {
|
||||
|
||||
// get the list of all sample names from the various VCF input rods
|
||||
TreeSet<String> samples = new TreeSet<String>();
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||
|
||||
// add the non-VCF sample from the command-line, if applicable
|
||||
if ( sampleName != null ) {
|
||||
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
|
||||
samples.add(sampleName.toUpperCase());
|
||||
}
|
||||
|
||||
// if there are no valid samples, warn the user
|
||||
if ( samples.size() == 0 ) {
|
||||
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
|
||||
}
|
||||
|
||||
engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
|
||||
|
||||
engine.setExplode( Boolean.TRUE.equals( EXPLODE ) );
|
||||
engine.setRequestedColumns(COLUMNS);
|
||||
|
||||
// setup the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "Annotator"));
|
||||
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
|
||||
hInfo.addAll(engine.getVCFAnnotationDescriptions());
|
||||
|
||||
vcfWriter = new VCFWriter(VCF_OUT);
|
||||
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
|
||||
/**
|
||||
* We want reads that span deletions
|
||||
*
|
||||
* @return true
|
||||
*/
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
/**
|
||||
* For each site of interest, annotate based on the requested annotation types
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return 1 if the locus was successfully processed, 0 if otherwise
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
|
||||
List<Object> rods = tracker.getReferenceMetaData("variant");
|
||||
// ignore places where we don't have a variant
|
||||
if ( rods.size() == 0 )
|
||||
return 0;
|
||||
|
||||
Object variant = rods.get(0);
|
||||
VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant);
|
||||
if ( vc == null )
|
||||
return 0;
|
||||
|
||||
// if the reference base is not ambiguous, we can annotate
|
||||
Collection<VariantContext> annotatedVCs = null;
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
||||
if ( stratifiedContexts != null ) {
|
||||
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
|
||||
}
|
||||
}
|
||||
|
||||
for(VariantContext annotatedVC : annotatedVCs) {
|
||||
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of loci processed.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return the new number of loci processed.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tell the user the number of loci processed and close out the new variants file.
|
||||
*
|
||||
* @param result the number of loci seen.
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
out.printf("Processed %d loci.\n", result);
|
||||
|
||||
vcfWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue