diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java index 37d5048cc..1b1423169 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java @@ -354,25 +354,40 @@ public class TranscriptToInfo extends RodWalker, TreeMap /** * Verify that gene names, when taken together, provide a unique key for this record * this guarantees that the computeSortKey(..) method will work as expected. + * If this is not the case, disambiguate the gene names by appending a numeric suffix ('.2', '.3', ...) to the first gene name of the incoming record and registering it under the new key. * * @param parsedTranscriptRod */ private void checkGeneNamesForUniqueness(TranscriptTableRecord parsedTranscriptRod) { - StringBuilder geneNamePortionOfSortKey = new StringBuilder(); - for(String geneName : parsedTranscriptRod.geneNames) { - geneNamePortionOfSortKey.append(geneName); - } - - TranscriptTableRecord collisionRecord = keyChecker.put(geneNamePortionOfSortKey.toString(), parsedTranscriptRod); - if(collisionRecord != null && new Interval( + final String geneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames); + final TranscriptTableRecord collisionRecord = keyChecker.get( geneNamePortionOfSortKey ); + System.err.println("Checking key: " + geneNamePortionOfSortKey + " - got record: " + collisionRecord); /* NOTE(review): this prints to stderr for every record checked and looks like a leftover debug trace - confirm whether it should be removed or routed through logger */ + if(collisionRecord != null /* && new Interval( - don't allow collisions even if positions are different. 
collisionRecord.txChrom, (int) collisionRecord.txStart, (int) collisionRecord.txEnd).intersects( new Interval( parsedTranscriptRod.txChrom, (int) parsedTranscriptRod.txStart, - (int) parsedTranscriptRod.txEnd))) { - throw new RuntimeException("There is a collision between the positions + gene names of the following two records: \n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Since these transcripts overlap and have identical gene names, output data would likely be lost due to collisions in the keys generated by computeSortKey(..)"); + (int) parsedTranscriptRod.txEnd)) */) { + + //disambiguate the gene names of parsedTranscriptRod + int counter = 1; + String newGeneNamePortionOfSortKey = null; + String[] newGeneNames = null; + do { + newGeneNames = parsedTranscriptRod.geneNames.clone(); + newGeneNames[0] += ("." + ++counter); //append '.2', '.3', etc. to the first gene name (counter starts at 1 and is pre-incremented, so '.1' is never used) until you find a key that doesn't collide + newGeneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(newGeneNames); + } while(keyChecker.containsKey(newGeneNamePortionOfSortKey)); + + parsedTranscriptRod.geneNames = newGeneNames; + + keyChecker.put(newGeneNamePortionOfSortKey, parsedTranscriptRod ); + + logger.warn("WARNING: The gene names of the following two records were identical, so they have been disambiguated:\n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Otherwise, output data would likely be lost due to collisions in the keys generated by computeSortKey(..). "); + } else { + keyChecker.put(geneNamePortionOfSortKey, parsedTranscriptRod ); } } @@ -917,12 +932,25 @@ public class TranscriptToInfo extends RodWalker, TreeMap String result = Long.toString(key); if(parsedTranscriptRod != null) { - for(String geneName : parsedTranscriptRod.geneNames) - result += geneName; //append all gene names, so that the key is unique across all possible splicing variants, etc. 
+ result += computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames); //append all gene names, so that the key is unique across all possible splicing variants, etc. } return result; } + /** + * Computes the gene-name portion of the output record sort key. + * The names are appended back-to-back in array order with no separator between them. NOTE(review): different arrays can therefore produce the same string (e.g. {"AB","C"} and {"A","BC"}) - confirm this is acceptable for the callers. + * + * @param geneNames the gene names to concatenate; must not be null, since it is iterated directly + * + * @return the concatenation of all entries of geneNames (the empty string for an empty array) + */ + private String computeGeneNamePortionOfSortKey(String[] geneNames) { + StringBuilder result = new StringBuilder(); + for(String geneName : geneNames) + result.append(geneName); //append all gene names, so that the key is unique across all possible splicing variants, etc. + return result.toString(); + } + /** * Moves the file to the destination directory. * @@ -1007,7 +1035,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap public int[] exonEnds; //public int[] exonFrames; - not used for anything, frame is computed another way - private AnnotatorInputTableFeature rod; + /** @@ -1016,7 +1044,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap * @param transcriptRod A rod representing a single record in the transcript table. */ public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) { - this.rod = transcriptRod; + //String binStr = transcriptRod.get("bin"); //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN); @@ -1150,7 +1178,9 @@ public class TranscriptToInfo extends RodWalker, TreeMap @Override public String toString() { - return rod.toString(); + StringBuilder sb = new StringBuilder(); + sb.append("chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds)); /* one-line summary built from the record's own fields now that the backing rod is no longer stored; NOTE(review): the StringBuilder only wraps a single already-concatenated string */ + return sb.toString(); }