From c0f4695902328ba778255a3015735187d2c8f600 Mon Sep 17 00:00:00 2001 From: weisburd Date: Wed, 14 Apr 2010 14:42:19 +0000 Subject: [PATCH] Improved handling of haplotypeReference and haplotypeAlternate columns. Added haplotypeStrand column. Improved handling of empty fields in data files. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3166 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/annotator/GenomicAnnotation.java | 134 +++++++++++++----- 1 file changed, 100 insertions(+), 34 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java index c605ec19c..6fdf7ad97 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java @@ -12,11 +12,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.AnnotatorROD; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.TabularROD; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; @@ -24,40 +27,53 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; /** - * TODO comment + * This plugin for {@link VariantAnnotatorEngine} serves as the core + * of the {@link GenomicAnnotator}. It finds all records in the -B input files + * that match the given variant's position and, optionally, it's reference and alternate alleles. + * Whether or not matching is done by reference and alternate alleles for a particular input file + * based solely on whether the given -B input has columns named "haplotypeReference" and + * "haplotypeAlternate". */ public class GenomicAnnotation implements InfoFieldAnnotation { - private static final String HAP_REF = "hap_ref"; - private static final String HAP_ALT = "hap_alt"; + private static final String HAPLOTYPE_REFERENCE_COLUMN = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN; + private static final String HAPLOTYPE_ALTERNATE_COLUMN = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN; + private static final String HAPLOTYPE_STRAND_COLUMN = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN; /** * For each ROD (aka. record) which overlaps the current locus, generates a * set of annotations of the form: * - * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A), + * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue * - * These annotations are stored in a Map. + * Examples of generated annotations are: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. * + * @return The following is an explanation of this method's return value: + * The annotations (aka. columnName=fieldValue pairs) from a matching record in a particular file are stored in a Map. * Since a single input file can have multiple records that overlap the current * locus (eg. dbSNP can have multiple entries for the same location), a different - * Map is created for each of these, resulting in a List> - * for each input file. + * Map is created for each matching record in a particular file. + * The list of matching records for each file is then represented as a List> * - * The return value of this method is a Map of the form: + * The return value of this method is a Map of the form: * rodName1 -> List> * rodName2 -> List> * rodName3 -> List> * ... + * Where the rodNames are the -B binding names for each file that were specified on the command line. * - * The List values are guaranteed to have size > 0, and in most cases will have size == 1. + * NOTE: The lists (List>) are guaranteed to have size > 0. + * The reason is that a rodName -> List> entry will only + * be created in Map if the List has at least one element. */ public Map annotate(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc) { + //iterate over each record that overlaps the current locus, and, if it passes certain filters, + //add its values to the list of annotations for this locus. final Map annotations = new HashMap(); for(final GATKFeature gatkFeature : tracker.getAllRods()) { @@ -68,43 +84,90 @@ public class GenomicAnnotation implements InfoFieldAnnotation { } if( ! (rod instanceof TabularROD) ) { - continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns + continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns. } TabularROD tabularRod = (TabularROD) rod; final Map annotationsForRecord = convertRecordToAnnotations( tabularRod ); - //filter by matching the vc's reference allele and alternate allele to the record's HAP_REF and HAP_ALT columns. - final String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_ALT) ); - if(hapAltValue != null && !hapAltValue.equals("*")) { - Set alternateAlleles = vc.getAlternateAlleles(); - if(alternateAlleles.isEmpty()) { - //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right? - continue; - } else if(alternateAlleles.size() > 1) { - throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele."); - } + //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the + //alleles specified match the the variant's reference allele and alternate allele. + //If they don't match, this record will be skipped, and its values will not be used for annotations. + // + //If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match. + //Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two + //are exactly equal (case-insensitive). - //TODO handle strand somehow? + //The HAPLOTYPE_REFERENCE_COLUMN is matches the variant's reference allele based on a case-insensitive string comparison. + //The HAPLOTYPE_ALTERNATE_COLUMN is can optionally list more than allele separated by one of these chars: ,\/:| + //The matches if any of the + String hapAltValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) ); + if(hapAltValue != null) + { + if(!hapAltValue.equals("*")) + { + Set alternateAlleles = vc.getAlternateAlleles(); + //if(alternateAlleles.isEmpty()) { + //handle a site that has been called monomorphic reference + //alternateAlleles.add(vc.getReference()); + //continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right? + //} else + if(alternateAlleles.size() > 1) { + throw new StingException("Record (" + rod + ") in " + name + " contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele."); + } - //decide whether to skip this record. - Allele vcAlt = alternateAlleles.iterator().next(); - if(!vcAlt.basesMatch(hapAltValue)) { - //TODO check the intersection of vcAlt and hapAltValue - continue; //skip record + boolean positiveStrand = true; //if HAPLOTYPE_STRAND_COLUMN isn't specified, assume positive strand. + String hapStrandValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_STRAND_COLUMN) ); + if(hapStrandValue != null ) { + hapStrandValue = hapStrandValue.trim().toLowerCase(); + if(hapStrandValue.equals("-") || hapStrandValue.equals("r")) { + positiveStrand = false; + } else if(!hapStrandValue.equals("+") && !hapStrandValue.equals("f")) { + throw new StingException("Record (" + rod + ") in " + name + " has an invalid value for " + HAPLOTYPE_STRAND_COLUMN + ". This value is: \"" + hapStrandValue + "\""); + } + } + + + Allele vcAlt; + if(alternateAlleles.isEmpty()) { + vcAlt = vc.getReference(); + } else { + vcAlt = alternateAlleles.iterator().next(); + } + + boolean matchFound = false; + for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) { + if(!positiveStrand) { + hapAlt = BaseUtils.simpleReverseComplement(hapAlt); + } + + if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) { + matchFound = true; + break; + } + } + if(!matchFound) { + continue; //skip record - none of its alternate alleles match the variant's alternate allele + } } } - final String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAP_REF) ); - if(hapRefValue != null && !hapRefValue.equals("*")) { - //match against hapRef - Allele vcRef = vc.getReference(); - if(!vcRef.basesMatch(hapRefValue)) { - //TODO check the intersection of vcRef and hapRefValue - continue; //skip record + String hapRefValue = annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) ); + if(hapRefValue != null) + { + hapRefValue = hapRefValue.trim(); + if(!hapRefValue.equals("*")) + { + //match against hapolotypeReference. + Allele vcRef = vc.getReference(); + if(!vcRef.basesMatch(hapRefValue)) { + //TODO check the intersection of vcRef and hapRefValue + continue; //skip record + } } } + //filters passed, so add this record. List> listOfMatchingRecords = (List>) annotations.get( name ); if(listOfMatchingRecords == null) { listOfMatchingRecords = new LinkedList>(); @@ -135,7 +198,10 @@ public class GenomicAnnotation implements InfoFieldAnnotation { final Map result = new HashMap(); for(final Entry entry : rod.entrySet()) { - result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue()); + final String value = entry.getValue(); + if(!value.trim().isEmpty()) { + result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue()); + } } return result;