Refactored some of the small utility methods

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3475 348d0f76-0448-11de-a6fe-93d51630548a
2010-06-02 16:26:00 +00:00 · 2010-06-02 16:26:00 +00:00 · e14ae471a0
parent 898a78e97d
commit e14ae471a0
1 changed files with 43 additions and 58 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
@ -26,7 +26,6 @@
 package org.broadinstitute.sting.playground.gatk.walkers.annotator;

 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@ -60,31 +59,39 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
    public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate";
    public static final String HAPLOTYPE_STRAND_COLUMN = "haplotypeStrand";

+    public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords";
+
+    /** Characters that aren't allowed within VCF info field key-value pairs */
+    public static final char[] ILLEGAL_INFO_FIELD_VALUES            = {  ' ', '=', ';' };
+    /** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */
+    public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = {  '_', '-',  '!'  };

    /**
-     * For each ROD (aka. record) which overlaps the current locus, generates a
+     * For each -B input file, for each record which overlaps the current locus, generates a
     * set of annotations of the form:
     *
-     * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
+     * bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc.
     *
-     * Examples of generated annotations are: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc.
+     * For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc.
     *
     * @return The following is an explanation of this method's return value:
-     * The annotations (aka. columnName=fieldValue pairs) from a matching record in a particular file are stored in a Map<String, String>.
+     *
+     * The annotations from a matching in a particular file are stored in a Map<String, String>
+     * where the key is bindingName.columnName and the value is the columnValue.
     * Since a single input file can have multiple records that overlap the current
-     * locus (eg. dbSNP can have multiple entries for the same location), a different
+     * locus (eg. dbSNP can have multiple entries for the same genomic position), a different
     * Map<String, String> is created for each matching record in a particular file.
-     * The list of matching records for each file is then represented as a List<Map<String, String>>
+     * The set of matching records for each file is then represented as a List<Map<String, String>>
     *
     * The return value of this method is a Map<String, Object> of the form:
     *     rodName1 -> List<Map<String, String>>
     *     rodName2 -> List<Map<String, String>>
     *     rodName3 -> List<Map<String, String>>
     *     ...
-     * Where the rodNames are the -B binding names for each file that were specified on the command line.
+     * Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file).
     *
-     * NOTE: The lists (List<Map<String, String>>) are guaranteed to have size > 0.
-     * The reason is that a rodName -> List<Map<String, String>> entry will only
+     * NOTE: The lists (List<Map<String, String>>) are guaranteed to have size > 0
+     * because a  rodName -> List<Map<String, String>>  entry will only
     * be created in Map<String, Object> if the List has at least one element.
     */
    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
@ -106,7 +113,7 @@ public class GenomicAnnotation implements InfoFieldAnnotation {
                continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns.
            }

-            final Map<String, String> annotationsForRecord = convertRecordToAnnotations( gatkFeature );
+            final Map<String, String> annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues());

            //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the
            //alleles specified match the the variant's reference allele and alternate allele.
@ -202,77 +209,55 @@ public class GenomicAnnotation implements InfoFieldAnnotation {


    /**
-     * Converts the ROD to a set of key-value pairs of the form:
-     *   thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
+     * Converts the given record to a set of key-value pairs of the form:
+     *   bindingName.columnName1=column1Value, bindingName.columnName2=column2Value
     *   (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
     *
-     * @param annotatorInputTableFeature AnnotatorInputTableFeature corresponding to one record in one input file.
+     * @param annotatorInputTableFeature AnnotatorInputTableFeature corresponding to one record in one -B input file.
     * @param name The binding name of the given AnnotatorInputTableFeature.
-     * @return The map of column-name -> value pairs.
+     * @return The map of columnName -> columnValue pairs.
     */
-    private Map<String, String> convertRecordToAnnotations( final GATKFeature feature) {
-
+    public static Map<String, String> convertRecordToAnnotations( String bindingName, Map<String, String> record) {
        final Map<String, String> result = new HashMap<String, String>();
-        for(final Entry<String, String> entry : ((AnnotatorInputTableFeature) feature.getUnderlyingObject()).getEntrySet()) {
+
+        for(final Entry<String, String> entry : record.entrySet()) {
            final String value = entry.getValue();
            if(!value.trim().isEmpty()) {
-                result.put( generateInfoFieldKey(feature.getName(), entry.getKey()), entry.getValue());
+                result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue()));
            }
        }

        return result;
    }

+    /**
+     * Combines the 2 values into a full key.
+     * @param rodBindingName
+     * @param columnName
+     * @return
+     */
    public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
-        return rodBindingName + "." + columnName;
+        return rodBindingName + '.' + columnName;
    }


+
    /**
-     * Parses the columns arg and returns a Map of columns hashed by their binding name.
-     * For example:
-     *   The command line:
-     *      -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd
+     * Replaces any characters that are not allowed in the info field of a VCF file.
     *
-     *   will be passed to this method as:
-     *       ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"]
-     *
-     *   resulting in a return value of:
-     *      {
-     *       "dbSnp" -> "dbSnp.valid" ,
-     *       "dbSnp" -> "dbsnp.avHet" ,
-     *       "refGene" -> "refGene.txStart",
-     *       "refGene" -> "refGene.txEnd"
-     *      }
-     *
-     * @param columnsArg The -s command line arg value.
-     *
-     * @return Map representing a parsed version of this arg - see above.
+     * @param value
+     * @return the value with any illegal characters replaced by legal ones.
     */
-    public static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
-        Map<String, Set<String>> result = new HashMap<String, Set<String>>();
-
-        for(String s : columnsArg) {
-            for(String columnSpecifier : s.split(",") ) {
-                String[] rodNameColumnName = columnSpecifier.split("\\.");
-                if(rodNameColumnName.length != 2) {
-                    throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'.");
-                }
-                String rodName = rodNameColumnName[0];
-                //String columnName = rodNameColumnName[1];
-
-                Set<String> requestedColumns = result.get(rodName);
-                if(requestedColumns == null) {
-                    requestedColumns = new HashSet<String>();
-                    result.put(rodName, requestedColumns);
-                }
-                requestedColumns.add(columnSpecifier);
-            }
+    private static String scrubInfoFieldValue(String value) {
+        for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) {
+            value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]);
        }

-        return result;
+        return value;
    }

+
+
    public VCFInfoHeaderLine getDescription() {
        return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. ");
    }