Improved handling of haplotypeReference and haplotypeAlternate columns. Added haplotypeStrand column. Improved handling of empty fields in data files.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3166 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
04c22a6640
commit
c0f4695902
|
|
@ -12,11 +12,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||
|
||||
|
|
@ -24,40 +27,53 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
|||
|
||||
|
||||
/**
|
||||
* TODO comment
|
||||
* This plugin for {@link VariantAnnotatorEngine} serves as the core
|
||||
* of the {@link GenomicAnnotator}. It finds all records in the -B input files
|
||||
* that match the given variant's position and, optionally, it's reference and alternate alleles.
|
||||
* Whether or not matching is done by reference and alternate alleles for a particular input file
|
||||
* based solely on whether the given -B input has columns named "haplotypeReference" and
|
||||
* "haplotypeAlternate".
|
||||
*/
|
||||
public class GenomicAnnotation implements InfoFieldAnnotation {
|
||||
|
||||
private static final String HAP_REF = "hap_ref";
|
||||
private static final String HAP_ALT = "hap_alt";
|
||||
private static final String HAPLOTYPE_REFERENCE_COLUMN = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
|
||||
private static final String HAPLOTYPE_ALTERNATE_COLUMN = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
|
||||
private static final String HAPLOTYPE_STRAND_COLUMN = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN;
|
||||
|
||||
|
||||
/**
|
||||
* For each ROD (aka. record) which overlaps the current locus, generates a
|
||||
* set of annotations of the form:
|
||||
*
|
||||
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
|
||||
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
|
||||
*
|
||||
* These annotations are stored in a Map<String, String>.
|
||||
* Examples of generated annotations are: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc.
|
||||
*
|
||||
* @return The following is an explanation of this method's return value:
|
||||
* The annotations (aka. columnName=fieldValue pairs) from a matching record in a particular file are stored in a Map<String, String>.
|
||||
* Since a single input file can have multiple records that overlap the current
|
||||
* locus (eg. dbSNP can have multiple entries for the same location), a different
|
||||
* Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
|
||||
* for each input file.
|
||||
* Map<String, String> is created for each matching record in a particular file.
|
||||
* The list of matching records for each file is then represented as a List<Map<String, String>>
|
||||
*
|
||||
* The return value of this method is a Map of the form:
|
||||
* The return value of this method is a Map<String, Object> of the form:
|
||||
* rodName1 -> List<Map<String, String>>
|
||||
* rodName2 -> List<Map<String, String>>
|
||||
* rodName3 -> List<Map<String, String>>
|
||||
* ...
|
||||
* Where the rodNames are the -B binding names for each file that were specified on the command line.
|
||||
*
|
||||
* The List values are guaranteed to have size > 0, and in most cases will have size == 1.
|
||||
* NOTE: The lists (List<Map<String, String>>) are guaranteed to have size > 0.
|
||||
* The reason is that a rodName -> List<Map<String, String>> entry will only
|
||||
* be created in Map<String, Object> if the List has at least one element.
|
||||
*/
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, StratifiedAlignmentContext> stratifiedContexts,
|
||||
final VariantContext vc) {
|
||||
|
||||
//iterate over each record that overlaps the current locus, and, if it passes certain filters,
|
||||
//add its values to the list of annotations for this locus.
|
||||
final Map<String, Object> annotations = new HashMap<String, Object>();
|
||||
for(final GATKFeature gatkFeature : tracker.getAllRods())
|
||||
{
|
||||
|
|
@ -68,43 +84,90 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
|
|||
}
|
||||
|
||||
if( ! (rod instanceof TabularROD) ) {
|
||||
continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns
|
||||
continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns.
|
||||
}
|
||||
|
||||
TabularROD tabularRod = (TabularROD) rod;
|
||||
final Map<String, String> annotationsForRecord = convertRecordToAnnotations( tabularRod );
|
||||
|
||||
//filter by matching the vc's reference allele and alternate allele to the record's HAP_REF and HAP_ALT columns.
|
||||
final String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_ALT) );
|
||||
if(hapAltValue != null && !hapAltValue.equals("*")) {
|
||||
Set<Allele> alternateAlleles = vc.getAlternateAlleles();
|
||||
if(alternateAlleles.isEmpty()) {
|
||||
//TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right?
|
||||
continue;
|
||||
} else if(alternateAlleles.size() > 1) {
|
||||
throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele.");
|
||||
}
|
||||
//If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the
|
||||
//alleles specified match the the variant's reference allele and alternate allele.
|
||||
//If they don't match, this record will be skipped, and its values will not be used for annotations.
|
||||
//
|
||||
//If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match.
|
||||
//Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two
|
||||
//are exactly equal (case-insensitive).
|
||||
|
||||
//TODO handle strand somehow?
|
||||
//The HAPLOTYPE_REFERENCE_COLUMN is matches the variant's reference allele based on a case-insensitive string comparison.
|
||||
//The HAPLOTYPE_ALTERNATE_COLUMN is can optionally list more than allele separated by one of these chars: ,\/:|
|
||||
//The matches if any of the
|
||||
String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) );
|
||||
if(hapAltValue != null)
|
||||
{
|
||||
if(!hapAltValue.equals("*"))
|
||||
{
|
||||
Set<Allele> alternateAlleles = vc.getAlternateAlleles();
|
||||
//if(alternateAlleles.isEmpty()) {
|
||||
//handle a site that has been called monomorphic reference
|
||||
//alternateAlleles.add(vc.getReference());
|
||||
//continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right?
|
||||
//} else
|
||||
if(alternateAlleles.size() > 1) {
|
||||
throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele.");
|
||||
}
|
||||
|
||||
//decide whether to skip this record.
|
||||
Allele vcAlt = alternateAlleles.iterator().next();
|
||||
if(!vcAlt.basesMatch(hapAltValue)) {
|
||||
//TODO check the intersection of vcAlt and hapAltValue
|
||||
continue; //skip record
|
||||
boolean positiveStrand = true; //if HAPLOTYPE_STRAND_COLUMN isn't specified, assume positive strand.
|
||||
String hapStrandValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_STRAND_COLUMN) );
|
||||
if(hapStrandValue != null ) {
|
||||
hapStrandValue = hapStrandValue.trim().toLowerCase();
|
||||
if(hapStrandValue.equals("-") || hapStrandValue.equals("r")) {
|
||||
positiveStrand = false;
|
||||
} else if(!hapStrandValue.equals("+") && !hapStrandValue.equals("f")) {
|
||||
throw new StingException("Record (" + rod + ") in " + name + " has an invalid value for " + HAPLOTYPE_STRAND_COLUMN + ". This value is: \"" + hapStrandValue + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Allele vcAlt;
|
||||
if(alternateAlleles.isEmpty()) {
|
||||
vcAlt = vc.getReference();
|
||||
} else {
|
||||
vcAlt = alternateAlleles.iterator().next();
|
||||
}
|
||||
|
||||
boolean matchFound = false;
|
||||
for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) {
|
||||
if(!positiveStrand) {
|
||||
hapAlt = BaseUtils.simpleReverseComplement(hapAlt);
|
||||
}
|
||||
|
||||
if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) {
|
||||
matchFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!matchFound) {
|
||||
continue; //skip record - none of its alternate alleles match the variant's alternate allele
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_REF) );
|
||||
if(hapRefValue != null && !hapRefValue.equals("*")) {
|
||||
//match against hapRef
|
||||
Allele vcRef = vc.getReference();
|
||||
if(!vcRef.basesMatch(hapRefValue)) {
|
||||
//TODO check the intersection of vcRef and hapRefValue
|
||||
continue; //skip record
|
||||
String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) );
|
||||
if(hapRefValue != null)
|
||||
{
|
||||
hapRefValue = hapRefValue.trim();
|
||||
if(!hapRefValue.equals("*"))
|
||||
{
|
||||
//match against hapolotypeReference.
|
||||
Allele vcRef = vc.getReference();
|
||||
if(!vcRef.basesMatch(hapRefValue)) {
|
||||
//TODO check the intersection of vcRef and hapRefValue
|
||||
continue; //skip record
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//filters passed, so add this record.
|
||||
List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get( name );
|
||||
if(listOfMatchingRecords == null) {
|
||||
listOfMatchingRecords = new LinkedList<Map<String,String>>();
|
||||
|
|
@ -135,7 +198,10 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
|
|||
|
||||
final Map<String, String> result = new HashMap<String, String>();
|
||||
for(final Entry<String, String> entry : rod.entrySet()) {
|
||||
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
|
||||
final String value = entry.getValue();
|
||||
if(!value.trim().isEmpty()) {
|
||||
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
|||
Loading…
Reference in New Issue