Fixed handling of records whose gene names are identical (e.g., RefSeq NR_030638 on chr20).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3554 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1e42984a16
commit
c1046653a2
|
|
@ -354,25 +354,40 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
|
|||
/**
|
||||
* Verify that gene names, when taken together, provide a unique key for this record
|
||||
* this guarantees that the computeSortKey(..) method will work as expected.
|
||||
* If this is not the case, disambiguate the gene names.
|
||||
*
|
||||
* @param parsedTranscriptRod
|
||||
*/
|
||||
private void checkGeneNamesForUniqueness(TranscriptTableRecord parsedTranscriptRod)
|
||||
{
|
||||
StringBuilder geneNamePortionOfSortKey = new StringBuilder();
|
||||
for(String geneName : parsedTranscriptRod.geneNames) {
|
||||
geneNamePortionOfSortKey.append(geneName);
|
||||
}
|
||||
|
||||
TranscriptTableRecord collisionRecord = keyChecker.put(geneNamePortionOfSortKey.toString(), parsedTranscriptRod);
|
||||
if(collisionRecord != null && new Interval(
|
||||
final String geneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames);
|
||||
final TranscriptTableRecord collisionRecord = keyChecker.get( geneNamePortionOfSortKey );
|
||||
System.err.println("Checking key: " + geneNamePortionOfSortKey + " - got record: " + collisionRecord);
|
||||
if(collisionRecord != null /* && new Interval( - don't allow collitions even if positions are different.
|
||||
collisionRecord.txChrom,
|
||||
(int) collisionRecord.txStart,
|
||||
(int) collisionRecord.txEnd).intersects( new Interval(
|
||||
parsedTranscriptRod.txChrom,
|
||||
(int) parsedTranscriptRod.txStart,
|
||||
(int) parsedTranscriptRod.txEnd))) {
|
||||
throw new RuntimeException("There is a collision between the positions + gene names of the following two records: \n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Since these transcripts overlap and have identical gene names, output data would likely be lost due to collisions in the keys generated by computeSortKey(..)");
|
||||
(int) parsedTranscriptRod.txEnd)) */) {
|
||||
|
||||
//disambiguate the gene names of parsedTranscriptRod
|
||||
int counter = 1;
|
||||
String newGeneNamePortionOfSortKey = null;
|
||||
String[] newGeneNames = null;
|
||||
do {
|
||||
newGeneNames = parsedTranscriptRod.geneNames.clone();
|
||||
newGeneNames[0] += ("." + ++counter); //append '.1' or similar until you find a key that doesn't collide
|
||||
newGeneNamePortionOfSortKey = computeGeneNamePortionOfSortKey(newGeneNames);
|
||||
} while(keyChecker.containsKey(newGeneNamePortionOfSortKey));
|
||||
|
||||
parsedTranscriptRod.geneNames = newGeneNames;
|
||||
|
||||
keyChecker.put(newGeneNamePortionOfSortKey, parsedTranscriptRod );
|
||||
|
||||
logger.warn("WARNING: The gene names of the following two records were identical, so they have been disambiguated:\n 1:" + collisionRecord + "\n 2:" + parsedTranscriptRod +".\n Otherwise, output data would likely be lost due to collisions in the keys generated by computeSortKey(..). ");
|
||||
} else {
|
||||
keyChecker.put(geneNamePortionOfSortKey, parsedTranscriptRod );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -917,12 +932,25 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
|
|||
|
||||
String result = Long.toString(key);
|
||||
if(parsedTranscriptRod != null) {
|
||||
for(String geneName : parsedTranscriptRod.geneNames)
|
||||
result += geneName; //append all gene names, so that the key is unique across all possible splicing variants, etc.
|
||||
result += computeGeneNamePortionOfSortKey(parsedTranscriptRod.geneNames); //append all gene names, so that the key is unique across all possible splicing variants, etc.
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the gene-name portion of the output record sort key.
|
||||
*
|
||||
* @param geneNames
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private String computeGeneNamePortionOfSortKey(String[] geneNames) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
for(String geneName : geneNames)
|
||||
result.append(geneName); //append all gene names, so that the key is unique across all possible splicing variants, etc.
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the file to the destination directory.
|
||||
*
|
||||
|
|
@ -1007,7 +1035,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
|
|||
public int[] exonEnds;
|
||||
//public int[] exonFrames; - not used for anything, frame is computed another way
|
||||
|
||||
private AnnotatorInputTableFeature rod;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -1016,7 +1044,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
|
|||
* @param transcriptRod A rod representing a single record in the transcript table.
|
||||
*/
|
||||
public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) {
|
||||
this.rod = transcriptRod;
|
||||
|
||||
//String binStr = transcriptRod.get("bin");
|
||||
//String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
|
||||
String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN);
|
||||
|
|
@ -1150,7 +1178,9 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return rod.toString();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue