Improved handling of haplotypeReference and haplotypeAlternate columns. Added haplotypeStrand column. Improved handling of empty fields in data files.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3166 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
weisburd 2010-04-14 14:42:19 +00:00
parent 04c22a6640
commit c0f4695902
1 changed files with 100 additions and 34 deletions

View File

@ -12,11 +12,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.TabularROD;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
@ -24,40 +27,53 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
/**
* TODO comment
* This plugin for {@link VariantAnnotatorEngine} serves as the core
* of the {@link GenomicAnnotator}. It finds all records in the -B input files
* that match the given variant's position and, optionally, it's reference and alternate alleles.
* Whether or not matching is done by reference and alternate alleles for a particular input file
* based solely on whether the given -B input has columns named "haplotypeReference" and
* "haplotypeAlternate".
*/
public class GenomicAnnotation implements InfoFieldAnnotation {
private static final String HAP_REF = "hap_ref";
private static final String HAP_ALT = "hap_alt";
private static final String HAPLOTYPE_REFERENCE_COLUMN = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
private static final String HAPLOTYPE_ALTERNATE_COLUMN = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
private static final String HAPLOTYPE_STRAND_COLUMN = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN;
/**
* For each ROD (aka. record) which overlaps the current locus, generates a
* set of annotations of the form:
*
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
* thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
*
* These annotations are stored in a Map<String, String>.
* Examples of generated annotations are: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc.
*
* @return The following is an explanation of this method's return value:
* The annotations (aka. columnName=fieldValue pairs) from a matching record in a particular file are stored in a Map<String, String>.
* Since a single input file can have multiple records that overlap the current
* locus (eg. dbSNP can have multiple entries for the same location), a different
* Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
* for each input file.
* Map<String, String> is created for each matching record in a particular file.
* The list of matching records for each file is then represented as a List<Map<String, String>>
*
* The return value of this method is a Map of the form:
* The return value of this method is a Map<String, Object> of the form:
* rodName1 -> List<Map<String, String>>
* rodName2 -> List<Map<String, String>>
* rodName3 -> List<Map<String, String>>
* ...
* Where the rodNames are the -B binding names for each file that were specified on the command line.
*
* The List values are guaranteed to have size > 0, and in most cases will have size == 1.
* NOTE: The lists (List<Map<String, String>>) are guaranteed to have size > 0.
* The reason is that a rodName -> List<Map<String, String>> entry will only
* be created in Map<String, Object> if the List has at least one element.
*/
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, StratifiedAlignmentContext> stratifiedContexts,
final VariantContext vc) {
//iterate over each record that overlaps the current locus, and, if it passes certain filters,
//add its values to the list of annotations for this locus.
final Map<String, Object> annotations = new HashMap<String, Object>();
for(final GATKFeature gatkFeature : tracker.getAllRods())
{
@ -68,43 +84,90 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
}
if( ! (rod instanceof TabularROD) ) {
continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns
continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns.
}
TabularROD tabularRod = (TabularROD) rod;
final Map<String, String> annotationsForRecord = convertRecordToAnnotations( tabularRod );
//filter by matching the vc's reference allele and alternate allele to the record's HAP_REF and HAP_ALT columns.
final String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_ALT) );
if(hapAltValue != null && !hapAltValue.equals("*")) {
Set<Allele> alternateAlleles = vc.getAlternateAlleles();
if(alternateAlleles.isEmpty()) {
//TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right?
continue;
} else if(alternateAlleles.size() > 1) {
throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele.");
}
//If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the
//alleles specified match the the variant's reference allele and alternate allele.
//If they don't match, this record will be skipped, and its values will not be used for annotations.
//
//If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match.
//Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two
//are exactly equal (case-insensitive).
//TODO handle strand somehow?
//The HAPLOTYPE_REFERENCE_COLUMN is matches the variant's reference allele based on a case-insensitive string comparison.
//The HAPLOTYPE_ALTERNATE_COLUMN is can optionally list more than allele separated by one of these chars: ,\/:|
//The matches if any of the
String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) );
if(hapAltValue != null)
{
if(!hapAltValue.equals("*"))
{
Set<Allele> alternateAlleles = vc.getAlternateAlleles();
//if(alternateAlleles.isEmpty()) {
//handle a site that has been called monomorphic reference
//alternateAlleles.add(vc.getReference());
//continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right?
//} else
if(alternateAlleles.size() > 1) {
throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele.");
}
//decide whether to skip this record.
Allele vcAlt = alternateAlleles.iterator().next();
if(!vcAlt.basesMatch(hapAltValue)) {
//TODO check the intersection of vcAlt and hapAltValue
continue; //skip record
boolean positiveStrand = true; //if HAPLOTYPE_STRAND_COLUMN isn't specified, assume positive strand.
String hapStrandValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_STRAND_COLUMN) );
if(hapStrandValue != null ) {
hapStrandValue = hapStrandValue.trim().toLowerCase();
if(hapStrandValue.equals("-") || hapStrandValue.equals("r")) {
positiveStrand = false;
} else if(!hapStrandValue.equals("+") && !hapStrandValue.equals("f")) {
throw new StingException("Record (" + rod + ") in " + name + " has an invalid value for " + HAPLOTYPE_STRAND_COLUMN + ". This value is: \"" + hapStrandValue + "\"");
}
}
Allele vcAlt;
if(alternateAlleles.isEmpty()) {
vcAlt = vc.getReference();
} else {
vcAlt = alternateAlleles.iterator().next();
}
boolean matchFound = false;
for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) {
if(!positiveStrand) {
hapAlt = BaseUtils.simpleReverseComplement(hapAlt);
}
if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) {
matchFound = true;
break;
}
}
if(!matchFound) {
continue; //skip record - none of its alternate alleles match the variant's alternate allele
}
}
}
final String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_REF) );
if(hapRefValue != null && !hapRefValue.equals("*")) {
//match against hapRef
Allele vcRef = vc.getReference();
if(!vcRef.basesMatch(hapRefValue)) {
//TODO check the intersection of vcRef and hapRefValue
continue; //skip record
String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) );
if(hapRefValue != null)
{
hapRefValue = hapRefValue.trim();
if(!hapRefValue.equals("*"))
{
//match against hapolotypeReference.
Allele vcRef = vc.getReference();
if(!vcRef.basesMatch(hapRefValue)) {
//TODO check the intersection of vcRef and hapRefValue
continue; //skip record
}
}
}
//filters passed, so add this record.
List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get( name );
if(listOfMatchingRecords == null) {
listOfMatchingRecords = new LinkedList<Map<String,String>>();
@ -135,7 +198,10 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
final Map<String, String> result = new HashMap<String, String>();
for(final Entry<String, String> entry : rod.entrySet()) {
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
final String value = entry.getValue();
if(!value.trim().isEmpty()) {
result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
}
}
return result;