Fixed handling of records where gene-names are identical (e.g. as in RefSeq NR_030638 in chr20)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3554 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
weisburd 2010-06-14 20:00:49 +00:00
parent 1e42984a16
commit c1046653a2
1 changed files with 44 additions and 14 deletions

View File

@ -354,25 +354,40 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
/**
* Verify that gene names, when taken together, provide a unique key for this record
* this guarantees that the computeSortKey(..) method will work as expected.
* If this is not the case, disambiguate the gene names.
*
* @param parsedTranscriptRod
*/
private void checkGeneNamesForUniqueness(TranscriptTableRecord parsedTranscriptRod)
{
StringBuilder geneNamePortionOfSortKey = new StringBuilder();
for(String geneName : parsedTranscriptRod.geneNames) {
geneNamePortionOfSortKey.append(geneName);
}
TranscriptTableRecord collisionRecord = keyChecker.put(geneNamePortionOfSortKey.toString(), parsedTranscriptRod);
if(collisionRecord != null && new Interval(
final String geneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames);
final TranscriptTableRecord collisionRecord = keyChecker.get( geneNamePortionOfSortKey );
System.err.println("Checking key: " + geneNamePortionOfSortKey + " - got record: " + collisionRecord);
if(collisionRecord != null /* && new Interval( - don't allow collitions even if positions are different.
collisionRecord.txChrom,
(int) collisionRecord.txStart,
(int) collisionRecord.txEnd).intersects( new Interval(
parsedTranscriptRod.txChrom,
(int) parsedTranscriptRod.txStart,
(int) parsedTranscriptRod.txEnd))) {
throw new RuntimeException("There is a collision between the positions + gene names of the following two records: \n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Since these transcripts overlap and have identical gene names, output data would likely be lost due to collisions in the keys generated by computeSortKey(..)");
(int) parsedTranscriptRod.txEnd)) */) {
//disambiguate the gene names of parsedTranscriptRod
int counter = 1;
String newGeneNamePortionOfSortKey = null;
String[] newGeneNames = null;
do {
newGeneNames = parsedTranscriptRod.geneNames.clone();
newGeneNames[0] += ("." + ++counter); //append '.1' or similar until you find a key that doesn't collide
newGeneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(newGeneNames);
} while(keyChecker.containsKey(newGeneNamePortionOfSortKey));
parsedTranscriptRod.geneNames = newGeneNames;
keyChecker.put(newGeneNamePortionOfSortKey, parsedTranscriptRod );
logger.warn("WARNING: The gene names of the following two records were identical, so they have been disambiguated:\n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Otherwise, output data would likely be lost due to collisions in the keys generated by computeSortKey(..). ");
} else {
keyChecker.put(geneNamePortionOfSortKey, parsedTranscriptRod );
}
}
@ -917,12 +932,25 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
String result = Long.toString(key);
if(parsedTranscriptRod != null) {
for(String geneName : parsedTranscriptRod.geneNames)
result += geneName; //append all gene names, so that the key is unique across all possible splicing variants, etc.
result += computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames); //append all gene names, so that the key is unique across all possible splicing variants, etc.
}
return result;
}
/**
 * Builds the gene-name portion of the output record sort key by concatenating the
 * given gene names in array order, with nothing inserted between them.
 *
 * @param geneNames the gene names to concatenate
 *
 * @return the concatenation of all elements of geneNames; the empty string if the array is empty
 */
private String computeGeneNamePortionOfSortKey(String[] geneNames) {
    final StringBuilder joinedNames = new StringBuilder();
    // Every name takes part, so that the key is unique across all possible splicing variants, etc.
    for(int i = 0; i < geneNames.length; i++) {
        joinedNames.append(geneNames[i]);
    }
    return joinedNames.toString();
}
/**
* Moves the file to the destination directory.
*
@ -1007,7 +1035,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
public int[] exonEnds;
//public int[] exonFrames; - not used for anything, frame is computed another way
private AnnotatorInputTableFeature rod;
/**
@ -1016,7 +1044,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* @param transcriptRod A rod representing a single record in the transcript table.
*/
public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) {
this.rod = transcriptRod;
//String binStr = transcriptRod.get("bin");
//String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN);
@ -1150,7 +1178,9 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
/**
 * Returns a one-line summary of this record built from its parsed fields: transcript
 * position, strand, gene names, CDS range, and the exon start/end arrays.
 *
 * @return the summary text, e.g. for use in log and warning messages
 */
@Override
public String toString() {
    // The expression starts with a String, so the char literals are appended as characters.
    return "chrpos=" + txChrom + ':' + txStart + '-' + txEnd
            + ", strand=" + (positiveStrand ? '+' : '-')
            + ", gene-names=" + Arrays.toString(geneNames)
            + ", cds=" + cdsStart + '-' + cdsEnd
            + ", exonStarts=" + Arrays.toString(exonStarts)
            + ", exonEnds=" + Arrays.toString(exonEnds);
}