diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java index 5cfd55d1d..2c1b5bad6 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.annotator; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -60,31 +59,39 @@ public class GenomicAnnotation implements InfoFieldAnnotation { public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate"; public static final String HAPLOTYPE_STRAND_COLUMN = "haplotypeStrand"; + public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords"; + + /** Characters that aren't allowed within VCF info field key-value pairs */ + public static final char[] ILLEGAL_INFO_FIELD_VALUES = { ' ', '=', ';' }; + /** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */ + public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = { '_', '-', '!' }; /** - * For each ROD (aka. record) which overlaps the current locus, generates a + * For each -B input file, for each record which overlaps the current locus, generates a * set of annotations of the form: * - * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue + * bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc. * - * Examples of generated annotations are: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. + * For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. * * @return The following is an explanation of this method's return value: - * The annotations (aka. columnName=fieldValue pairs) from a matching record in a particular file are stored in a Map. + * + * The annotations from a matching in a particular file are stored in a Map + * where the key is bindingName.columnName and the value is the columnValue. * Since a single input file can have multiple records that overlap the current - * locus (eg. dbSNP can have multiple entries for the same location), a different + * locus (eg. dbSNP can have multiple entries for the same genomic position), a different * Map is created for each matching record in a particular file. - * The list of matching records for each file is then represented as a List> + * The set of matching records for each file is then represented as a List> * * The return value of this method is a Map of the form: * rodName1 -> List> * rodName2 -> List> * rodName3 -> List> * ... - * Where the rodNames are the -B binding names for each file that were specified on the command line. + * Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file). * - * NOTE: The lists (List>) are guaranteed to have size > 0. - * The reason is that a rodName -> List> entry will only + * NOTE: The lists (List>) are guaranteed to have size > 0 + * because a rodName -> List> entry will only * be created in Map if the List has at least one element. */ public Map annotate(final RefMetaDataTracker tracker, @@ -106,7 +113,7 @@ public class GenomicAnnotation implements InfoFieldAnnotation { continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns. } - final Map annotationsForRecord = convertRecordToAnnotations( gatkFeature ); + final Map annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues()); //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the //alleles specified match the the variant's reference allele and alternate allele. @@ -202,77 +209,55 @@ public class GenomicAnnotation implements InfoFieldAnnotation { /** - * Converts the ROD to a set of key-value pairs of the form: - * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue + * Converts the given record to a set of key-value pairs of the form: + * bindingName.columnName1=column1Value, bindingName.columnName2=column2Value * (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A) * - * @param annotatorInputTableFeature AnnotatorInputTableFeature corresponding to one record in one input file. + * @param annotatorInputTableFeature AnnotatorInputTableFeature corresponding to one record in one -B input file. * @param name The binding name of the given AnnotatorInputTableFeature. - * @return The map of column-name -> value pairs. + * @return The map of columnName -> columnValue pairs. */ - private Map convertRecordToAnnotations( final GATKFeature feature) { - + public static Map convertRecordToAnnotations( String bindingName, Map record) { final Map result = new HashMap(); - for(final Entry entry : ((AnnotatorInputTableFeature) feature.getUnderlyingObject()).getEntrySet()) { + + for(final Entry entry : record.entrySet()) { final String value = entry.getValue(); if(!value.trim().isEmpty()) { - result.put( generateInfoFieldKey(feature.getName(), entry.getKey()), entry.getValue()); + result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue())); } } return result; } + /** + * Combines the 2 values into a full key. + * @param rodBindingName + * @param columnName + * @return + */ public static String generateInfoFieldKey(String rodBindingName, String columnName ) { - return rodBindingName + "." + columnName; + return rodBindingName + '.' + columnName; } + /** - * Parses the columns arg and returns a Map of columns hashed by their binding name. - * For example: - * The command line: - * -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd + * Replaces any characters that are not allowed in the info field of a VCF file. * - * will be passed to this method as: - * ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"] - * - * resulting in a return value of: - * { - * "dbSnp" -> "dbSnp.valid" , - * "dbSnp" -> "dbsnp.avHet" , - * "refGene" -> "refGene.txStart", - * "refGene" -> "refGene.txEnd" - * } - * - * @param columnsArg The -s command line arg value. - * - * @return Map representing a parsed version of this arg - see above. + * @param value + * @return the value with any illegal characters replaced by legal ones. */ - public static Map> parseColumnsArg(String[] columnsArg) { - Map> result = new HashMap>(); - - for(String s : columnsArg) { - for(String columnSpecifier : s.split(",") ) { - String[] rodNameColumnName = columnSpecifier.split("\\."); - if(rodNameColumnName.length != 2) { - throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'."); - } - String rodName = rodNameColumnName[0]; - //String columnName = rodNameColumnName[1]; - - Set requestedColumns = result.get(rodName); - if(requestedColumns == null) { - requestedColumns = new HashSet(); - result.put(rodName, requestedColumns); - } - requestedColumns.add(columnSpecifier); - } + private static String scrubInfoFieldValue(String value) { + for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) { + value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]); } - return result; + return value; } + + public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. "); }