Fixed handling of records whose gene names are identical (e.g., RefSeq NR_030638 on chr20)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3554 348d0f76-0448-11de-a6fe-93d51630548a
2010-06-14 20:00:49 +00:00 · 2010-06-14 20:00:49 +00:00 · c1046653a2
parent 1e42984a16
commit c1046653a2
1 changed files with 44 additions and 14 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
@ -354,25 +354,40 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
    /**
     * Verify that gene names, when taken together, provide a unique key for this record
     * this guarantees that the computeSortKey(..) method will work as expected.
+     * If this is not the case, disambiguate them by appending a numeric suffix (".2", ".3", ...) to the first gene name.
     *
     * @param parsedTranscriptRod
     */
    private void checkGeneNamesForUniqueness(TranscriptTableRecord parsedTranscriptRod)
    {
-        StringBuilder geneNamePortionOfSortKey = new StringBuilder();
-        for(String geneName : parsedTranscriptRod.geneNames) {
-            geneNamePortionOfSortKey.append(geneName);
-        }
-
-        TranscriptTableRecord collisionRecord = keyChecker.put(geneNamePortionOfSortKey.toString(), parsedTranscriptRod);
-        if(collisionRecord != null && new Interval(
+        final String geneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames);
+        final TranscriptTableRecord collisionRecord = keyChecker.get( geneNamePortionOfSortKey );
+        //a non-null collisionRecord means an earlier record was already stored in keyChecker under this gene-name key
+        if(collisionRecord != null /* && new Interval(     - don't allow collisions even if positions are different.
                collisionRecord.txChrom,
                (int) collisionRecord.txStart,
                (int) collisionRecord.txEnd).intersects( new Interval(
                        parsedTranscriptRod.txChrom,
                        (int) parsedTranscriptRod.txStart,
-                        (int) parsedTranscriptRod.txEnd))) {
-            throw new RuntimeException("There is a collision between the positions + gene names of the following two records: \n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Since these transcripts overlap and have identical gene names, output data would likely be lost due to collisions in the keys generated by computeSortKey(..)");
+                        (int) parsedTranscriptRod.txEnd)) */) {
+
+            //disambiguate the gene names of parsedTranscriptRod
+            int counter = 1;
+            String newGeneNamePortionOfSortKey = null;
+            String[] newGeneNames = null;
+            do {
+                newGeneNames = parsedTranscriptRod.geneNames.clone();  //start from the unmodified names on every attempt
+                newGeneNames[0] += ("." +  ++counter); //counter is pre-incremented from 1, so this tries '.2', '.3', ... until the key doesn't collide
+                newGeneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(newGeneNames);
+            } while(keyChecker.containsKey(newGeneNamePortionOfSortKey));
+
+            parsedTranscriptRod.geneNames = newGeneNames;
+
+            keyChecker.put(newGeneNamePortionOfSortKey, parsedTranscriptRod );
+
+            logger.warn("WARNING: The gene names of the following two records were identical, so they have been disambiguated:\n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Otherwise, output data would likely be lost due to collisions in the keys generated by computeSortKey(..). ");
+        } else {
+            keyChecker.put(geneNamePortionOfSortKey, parsedTranscriptRod );
        }
    }

@ -917,12 +932,25 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap

        String result = Long.toString(key);
        if(parsedTranscriptRod != null) {
-            for(String geneName : parsedTranscriptRod.geneNames)
-            result += geneName;  //append all gene names, so that the key is unique across all possible splicing variants, etc.
+            result += computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames);  //append all gene names, so that the key is unique across all possible splicing variants, etc.
        }
        return result;
    }

+    /**
+     * Builds the gene-name portion of the output record sort key by concatenating every entry of geneNames, in array order, with no separator.
+     *
+     * @param geneNames the gene names to concatenate
+     * @return the concatenated gene names (an empty string for an empty array)
+     */
+    private String computeGeneNamePortionOfSortKey(String[] geneNames) {
+        final StringBuilder joined = new StringBuilder();
+        for(int i = 0; i < geneNames.length; i++) {
+            joined.append(geneNames[i]);  //every name contributes, so that the key is unique across all possible splicing variants, etc.
+        }
+        return joined.toString();
+    }
+
    /**
     * Moves the file to the destination directory.
     *
@ -1007,7 +1035,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
        public int[] exonEnds;
        //public int[] exonFrames; - not used for anything, frame is computed another way

-        private AnnotatorInputTableFeature rod;
+


        /**
@ -1016,7 +1044,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
         * @param transcriptRod A rod representing a single record in the transcript table.
         */
        public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) {
-            this.rod = transcriptRod;
+
            //String binStr = transcriptRod.get("bin");
            //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
            String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN);
@ -1150,7 +1178,9 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap

        @Override
        public String toString() {
-            return rod.toString();
+            //summarize the parsed fields: position, strand, gene names, cds range and exon boundaries
+            return "chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd
+                    + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds);
        }