1st attempt to implement extra columns
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3249 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a72a5a7b1a
commit
10bcd72593
|
|
@ -80,7 +80,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
public static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript
|
public static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript
|
||||||
public static final String OUTPUT_CODING_COORD = "codingCoord"; //1-based offset within the cds region
|
public static final String OUTPUT_CODING_COORD = "codingCoord"; //1-based offset within the cds region
|
||||||
|
|
||||||
public static final String OUTPUT_SPLICE_DISTANCE = "minSpliceDistance"; //eg. 40
|
public static final String OUTPUT_SPLICE_DISTANCE = "spliceDist"; //eg. integer, bp to nearest exon/intron boundary
|
||||||
|
|
||||||
public static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20
|
public static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20
|
||||||
public static final String OUTPUT_REFERENCE_CODON = "referenceCodon";
|
public static final String OUTPUT_REFERENCE_CODON = "referenceCodon";
|
||||||
|
|
@ -90,9 +90,14 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
|
||||||
public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
|
public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
|
||||||
|
|
||||||
public static final String OUTPUT_CODING_COORD_STR = "coding_coord_str";
|
public static final String OUTPUT_CODING_COORD_STR = "codingCoordStr";
|
||||||
public static final String OUTPUT_PROTEIN_COORD_STR = "protein_coord_str";
|
public static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr";
|
||||||
public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
|
|
||||||
|
//public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
|
||||||
|
public static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary
|
||||||
|
public static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
|
public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
|
||||||
OUTPUT_SORT_KEY,
|
OUTPUT_SORT_KEY,
|
||||||
|
|
@ -122,9 +127,10 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
|
||||||
OUTPUT_CODING_COORD_STR,
|
OUTPUT_CODING_COORD_STR,
|
||||||
OUTPUT_PROTEIN_COORD_STR,
|
OUTPUT_PROTEIN_COORD_STR,
|
||||||
OUTPUT_INTRON_COORD_STR,
|
|
||||||
|
|
||||||
OUTPUT_IN_CODING_REGION,
|
OUTPUT_IN_CODING_REGION,
|
||||||
|
OUTPUT_SPLICE_INFO,
|
||||||
|
OUTPUT_UORF_CHANGE,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -258,16 +264,21 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the distance to the nearest splice-site.
|
* Computes the distance to the nearest splice-site.
|
||||||
* Returns 1 when down-stream of splice-juction, -1 when upstream
|
* The returned value is negative its on the 5' side (eg. upstream) of the juntion, and
|
||||||
|
* positive if its on the 3' side.
|
||||||
*/
|
*/
|
||||||
public int distanceToSpliceSite(final long genomPosition) {
|
public int computeDistanceToNearestSpliceSite(final long genomPosition) {
|
||||||
int prevDistance = Integer.MAX_VALUE;
|
int prevDistance = Integer.MAX_VALUE;
|
||||||
int curDistance;
|
|
||||||
for(int i = 0; i < exonStarts.length; i++) {
|
for(int i = 0; i < exonStarts.length; i++) {
|
||||||
final long curStart = exonStarts[i];
|
final long curStart = exonStarts[i];
|
||||||
curDistance = (int) (curStart-genomPosition);
|
int curDistance = (int) (curStart-genomPosition);
|
||||||
if(genomPosition < curStart) {
|
if(genomPosition < curStart) {
|
||||||
return Math.min(prevDistance, curDistance);
|
//position is within the current intron
|
||||||
|
if(prevDistance < curDistance) {
|
||||||
|
return positiveStrand ? prevDistance : -prevDistance;
|
||||||
|
} else {
|
||||||
|
return positiveStrand ? -curDistance : curDistance;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
prevDistance = (int) (genomPosition - curStart) + 1;
|
prevDistance = (int) (genomPosition - curStart) + 1;
|
||||||
}
|
}
|
||||||
|
|
@ -275,13 +286,21 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
final long curStop = exonEnds[i];
|
final long curStop = exonEnds[i];
|
||||||
curDistance = (int) (curStop-genomPosition) + 1;
|
curDistance = (int) (curStop-genomPosition) + 1;
|
||||||
if(genomPosition <= curStop) {
|
if(genomPosition <= curStop) {
|
||||||
return Math.min(prevDistance, curDistance);
|
//position is within an exon
|
||||||
|
if(prevDistance < curDistance) {
|
||||||
|
return positiveStrand ? prevDistance : -prevDistance;
|
||||||
|
} else {
|
||||||
|
return positiveStrand ? -curDistance : curDistance;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
prevDistance = (int) (genomPosition - curStop);
|
prevDistance = (int) (genomPosition - curStop);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return prevDistance; //out of exons. return genomPosition-curStop
|
throw new IllegalArgumentException("Genomic position: [" + genomPosition +"] not found within transcript: " + this +". " +
|
||||||
|
"This method should not have been called for this position. NOTE: this method assumes that all transcripts start " +
|
||||||
|
"with an exon and end with an exon (rather than an intron). Is this wrong?");
|
||||||
|
//return prevDistance; //out of exons. return genomPosition-curStop
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -298,6 +317,66 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
return rod.toString();
|
return rod.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the coding coord of the 1st nucleotide in the transcript.
|
||||||
|
* If the 1st nucleotide is in the 5'utr, the returned value will be negative.
|
||||||
|
* Otherwise (if the 1st nucleotide is CDS), the returned value is 1.
|
||||||
|
*/
|
||||||
|
public int computeInitialCodingCoord() {
|
||||||
|
if(!isProteinCodingTranscript()) {
|
||||||
|
throw new StingException("This method should only be called for protein-coding transcripts");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(positiveStrand)
|
||||||
|
{
|
||||||
|
if( cdsStart == exonStarts[0] ) {
|
||||||
|
//the 1st nucleotide of the transcript is CDS.
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int result = 0;
|
||||||
|
for(int i = 0; i < exonStarts.length; i++)
|
||||||
|
{
|
||||||
|
final long exonStart = exonStarts[i];
|
||||||
|
final long exonEnd = exonEnds[i];
|
||||||
|
if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart
|
||||||
|
//this means cdsStart is within the current exon
|
||||||
|
result += (cdsStart - exonStart) + 1;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
//cdsStart is downstream of the current exon
|
||||||
|
result += (exonEnd - exonStart) + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -result; //negate because 5' UTR coding coord is negative
|
||||||
|
}
|
||||||
|
else //(negative strand)
|
||||||
|
{
|
||||||
|
final long cdsStart_5prime = cdsEnd;
|
||||||
|
if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) {
|
||||||
|
//the 1st nucleotide of the transcript is CDS.
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int result = 0;
|
||||||
|
for(int i = exonEnds.length - 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
final long exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i]
|
||||||
|
final long exonEnd = exonStarts[i];
|
||||||
|
if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart
|
||||||
|
//this means cdsStart is within the current exon
|
||||||
|
result += -(cdsStart_5prime - exonStart) + 1;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
//cdsStart is downstream of the current exon
|
||||||
|
result += -(exonEnd - exonStart) + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -result; //negate because 5' UTR coding coord is negative
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -362,7 +441,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
Collection<RODRecordList> rods = tracker.getBoundRodTracks();
|
final Collection<RODRecordList> rods = tracker.getBoundRodTracks();
|
||||||
if(rods.size() == 0) {
|
if(rods.size() == 0) {
|
||||||
//if there's nothing overlapping this locus, skip it.
|
//if there's nothing overlapping this locus, skip it.
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -378,7 +457,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
throw new RuntimeException("The 'refgene' -B arg must have the type: 'AnnotatorInputTable'.");
|
throw new RuntimeException("The 'refgene' -B arg must have the type: 'AnnotatorInputTable'.");
|
||||||
}
|
}
|
||||||
|
|
||||||
AnnotatorROD refGeneRod = (AnnotatorROD) refGeneRodObject;
|
final AnnotatorROD refGeneRod = (AnnotatorROD) refGeneRodObject;
|
||||||
RefGeneTranscriptRecord parsedRefGeneRod = (RefGeneTranscriptRecord) refGeneRod.getTemporaryAttribute("parsedRefGeneRod");
|
RefGeneTranscriptRecord parsedRefGeneRod = (RefGeneTranscriptRecord) refGeneRod.getTemporaryAttribute("parsedRefGeneRod");
|
||||||
if( parsedRefGeneRod == null ) {
|
if( parsedRefGeneRod == null ) {
|
||||||
parsedRefGeneRod = new RefGeneTranscriptRecord(refGeneRod);
|
parsedRefGeneRod = new RefGeneTranscriptRecord(refGeneRod);
|
||||||
|
|
@ -429,30 +508,30 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
|
||||||
if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
|
if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
|
||||||
System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
|
System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
|
||||||
//TODO figure out why all CDS lengths aren't in multiples of 3 nucleotides - Sarah says its ok to thow these out.
|
//discard transcripts where CDS length is not a multiple of 3
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//here, 5' and 3' are according to the DNA (not on the RNA)
|
|
||||||
int frame = 0; //the frame of the current position
|
|
||||||
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
|
|
||||||
int proteinOffset_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
|
|
||||||
int codonOffset_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
|
|
||||||
|
|
||||||
char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
|
char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
|
||||||
|
|
||||||
|
|
||||||
final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
|
final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
|
||||||
final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
|
final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
|
||||||
final int increment_5to3 = positiveStrand ? 1 : -1;
|
final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement
|
||||||
final int strandSign = increment_5to3; //alias
|
final int strandSign = increment_5to3; //alias
|
||||||
|
|
||||||
final long cdsStart_5prime = positiveStrand ? parsedRefGeneRod.cdsStart : parsedRefGeneRod.cdsEnd; //1-based, inclusive
|
final long cdsStart_5prime = positiveStrand ? parsedRefGeneRod.cdsStart : parsedRefGeneRod.cdsEnd; //1-based, inclusive
|
||||||
final long cdsEnd_3prime = positiveStrand ? parsedRefGeneRod.cdsEnd : parsedRefGeneRod.cdsStart ; //1-based, inclusive
|
final long cdsEnd_3prime = positiveStrand ? parsedRefGeneRod.cdsEnd : parsedRefGeneRod.cdsStart ; //1-based, inclusive
|
||||||
|
|
||||||
//compute chromosome key (later used to sort the output file)
|
int frame = 0; //the frame of the current position
|
||||||
|
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
|
||||||
|
int codonCount_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
|
||||||
|
int codingCoord_from5 = isProteinCodingTranscript ? parsedRefGeneRod.computeInitialCodingCoord() : 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
|
||||||
|
|
||||||
|
|
||||||
boolean mitochondrial = false;
|
boolean mitochondrial = false;
|
||||||
|
|
||||||
|
//compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
|
||||||
long chromKey;
|
long chromKey;
|
||||||
final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
|
final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
|
||||||
switch( Character.toLowerCase(lastChromChar) ) {
|
switch( Character.toLowerCase(lastChromChar) ) {
|
||||||
|
|
@ -474,6 +553,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
|
chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
|
||||||
chromKey *= 100000000000L;
|
chromKey *= 100000000000L;
|
||||||
|
|
||||||
|
PositionType positionType = null;
|
||||||
|
|
||||||
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
|
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
|
||||||
{
|
{
|
||||||
|
|
@ -483,19 +563,29 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
//whether the current txCoord_5to3 is within an exon
|
//whether the current txCoord_5to3 is within an exon
|
||||||
final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron
|
final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron
|
||||||
|
|
||||||
|
final int distanceToNearestSpliceSite = parsedRefGeneRod.computeDistanceToNearestSpliceSite(txCoord_5to3);
|
||||||
|
|
||||||
//figure out what region the current position is in.
|
//figure out what region the current position is in.
|
||||||
PositionType positionType;
|
|
||||||
if(isProteinCodingTranscript)
|
if(isProteinCodingTranscript)
|
||||||
{
|
{
|
||||||
if(isWithinExon)
|
if(isWithinExon)
|
||||||
{
|
{
|
||||||
if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) {
|
//update positionType (and codingCoord_from5 if needed)
|
||||||
|
if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //multiplying by strandSign is like doing absolute value.
|
||||||
positionType = PositionType.utr5;
|
positionType = PositionType.utr5;
|
||||||
} else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) {
|
} else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //multiplying by strandSign is like doing absolute value.
|
||||||
|
if(positionType != PositionType.utr3) {
|
||||||
|
//if we're transitioning from CDS to utr3, reset the coding coord to 1.
|
||||||
|
codingCoord_from5 = 1; //reset the coding coord for utr3
|
||||||
positionType = PositionType.utr3;
|
positionType = PositionType.utr3;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if(positionType != PositionType.CDS) {
|
||||||
|
//if we're transitioning from utr5 to CDS, reset the coding coord from -1 to 1.
|
||||||
|
codingCoord_from5 = 1;
|
||||||
positionType = PositionType.CDS;
|
positionType = PositionType.CDS;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
positionType = PositionType.intron;
|
positionType = PositionType.intron;
|
||||||
}
|
}
|
||||||
|
|
@ -514,11 +604,11 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
currentCodon_5to3 = new char[3];
|
currentCodon_5to3 = new char[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
codonOffset_from5++;
|
codonCount_from5++;
|
||||||
|
|
||||||
currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 - 1 ); //subtract 1 to go to zero-based coords
|
currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords
|
||||||
currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 );
|
currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 );
|
||||||
currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 + 1);
|
currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 + 1);
|
||||||
|
|
||||||
if(!positiveStrand) {
|
if(!positiveStrand) {
|
||||||
currentCodon_5to3[0] = BaseUtils.simpleComplement(currentCodon_5to3[0]);
|
currentCodon_5to3[0] = BaseUtils.simpleComplement(currentCodon_5to3[0]);
|
||||||
|
|
@ -527,9 +617,6 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
final String distanceToSpliceSiteString = Integer.toString(parsedRefGeneRod.distanceToSpliceSite(txCoord_5to3));
|
|
||||||
|
|
||||||
for(final char haplotypeAlternate : ALLELES )
|
for(final char haplotypeAlternate : ALLELES )
|
||||||
{
|
{
|
||||||
final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
|
final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
|
||||||
|
|
@ -545,55 +632,104 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
|
||||||
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
|
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
|
||||||
outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
|
outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
|
||||||
outputLineFields.put(OUTPUT_SPLICE_DISTANCE, distanceToSpliceSiteString );
|
outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) );
|
||||||
|
|
||||||
|
//comppute OUTPUT_SPLICE_INFO
|
||||||
|
final String spliceInfoString;
|
||||||
|
if(Math.abs(distanceToNearestSpliceSite) <= 10) {
|
||||||
|
if(distanceToNearestSpliceSite < 0) {
|
||||||
|
//is on the 5' side of the splice junction
|
||||||
|
if(isWithinExon) {
|
||||||
|
spliceInfoString = "splice-donor " + distanceToNearestSpliceSite;
|
||||||
|
} else {
|
||||||
|
spliceInfoString = "splice-acceptor " + distanceToNearestSpliceSite;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if(isWithinExon) {
|
||||||
|
spliceInfoString = "splice-acceptor " + distanceToNearestSpliceSite;
|
||||||
|
} else {
|
||||||
|
spliceInfoString = "splice-donor " + distanceToNearestSpliceSite;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString);
|
||||||
|
}
|
||||||
|
|
||||||
if(isProteinCodingTranscript)
|
if(isProteinCodingTranscript)
|
||||||
{
|
{
|
||||||
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
|
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
|
||||||
|
|
||||||
if(isWithinExon) {
|
if(isWithinExon)
|
||||||
outputLineFields.put(OUTPUT_CODING_COORD, Integer.toString(proteinOffset_from5) );
|
{
|
||||||
|
if(positionType == PositionType.utr5)
|
||||||
if(positionType == PositionType.CDS) {
|
{
|
||||||
|
/*
|
||||||
|
if(THIS IS A uORF)
|
||||||
|
outputLineFields.put(OUTPUT_UORF_CHANGE, "TODO" ); //TODO put this here
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
else if(positionType == PositionType.CDS)
|
||||||
|
{
|
||||||
final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
||||||
outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) );
|
outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) );
|
||||||
outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonOffset_from5) );
|
outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5) );
|
||||||
outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ Integer.toString(codonOffset_from5) );
|
|
||||||
outputLineFields.put(OUTPUT_CODING_COORD_STR, "c."+ Integer.toString(proteinOffset_from5) );
|
|
||||||
|
|
||||||
|
final AminoAcid refAA = AminoAcidTable.getAA(referenceCodon, mitochondrial);
|
||||||
outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon );
|
outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon );
|
||||||
final String refAA = AminoAcidTable.getAA3LetterCode(referenceCodon, mitochondrial);
|
outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode());
|
||||||
outputLineFields.put(OUTPUT_REFERENCE_AA, refAA);
|
|
||||||
|
|
||||||
final char temp = currentCodon_5to3[frame];
|
final char temp = currentCodon_5to3[frame];
|
||||||
currentCodon_5to3[frame] = haplotypeAlternate;
|
currentCodon_5to3[frame] = haplotypeAlternate;
|
||||||
final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
||||||
final String variantAA = AminoAcidTable.getAA3LetterCode(variantCodon, mitochondrial);
|
final AminoAcid variantAA = AminoAcidTable.getAA(variantCodon, mitochondrial);
|
||||||
outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon );
|
outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon );
|
||||||
outputLineFields.put(OUTPUT_VARIANT_AA, variantAA);
|
outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode());
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter() ); //for example: "p.K76A"
|
||||||
|
|
||||||
currentCodon_5to3[frame] = temp;
|
currentCodon_5to3[frame] = temp;
|
||||||
|
|
||||||
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.equals(variantAA)));
|
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.getLetter().equals(variantAA.getLetter())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final StringBuilder codingCoordStr = new StringBuilder();
|
||||||
|
codingCoordStr.append( mitochondrial ? "m." : "c." );
|
||||||
|
if(positionType == PositionType.utr3) {
|
||||||
|
codingCoordStr.append( '*' );
|
||||||
|
}
|
||||||
|
|
||||||
|
if(isWithinExon) {
|
||||||
|
codingCoordStr.append( Integer.toString(codingCoord_from5) );
|
||||||
|
} else {
|
||||||
|
//intronic coordinates
|
||||||
|
if(distanceToNearestSpliceSite < 0) {
|
||||||
|
codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) );
|
||||||
|
} else {
|
||||||
|
codingCoordStr.append( Integer.toString(codingCoord_from5 ) );
|
||||||
|
}
|
||||||
|
codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) );
|
||||||
|
}
|
||||||
|
codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
|
||||||
|
outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString() );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//write out the record
|
//write out the record
|
||||||
StringBuilder outputLine = new StringBuilder();
|
final StringBuilder outputLine = new StringBuilder();
|
||||||
for( String column : OUTPUT_COLUMN_NAMES) {
|
for( final String column : OUTPUT_COLUMN_NAMES ) {
|
||||||
if(outputLine.length() != 0) {
|
if(outputLine.length() != 0) {
|
||||||
outputLine.append(AnnotatorROD.DELIMITER);
|
outputLine.append( AnnotatorROD.DELIMITER );
|
||||||
}
|
}
|
||||||
String value = outputLineFields.get(column);
|
final String value = outputLineFields.get(column);
|
||||||
outputLine.append(value != null ? value : "");
|
outputLine.append(value != null ? value : "");
|
||||||
}
|
}
|
||||||
|
|
||||||
outputStream.println(outputLine);
|
outputStream.println(outputLine);
|
||||||
|
|
||||||
if( !isProteinCodingTranscript ) {
|
if( !isProteinCodingTranscript ) {
|
||||||
//need only one record for this position, instead of 4 (one for each allele)
|
//need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -603,11 +739,12 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
txOffset_from5++;
|
txOffset_from5++;
|
||||||
|
|
||||||
if(positionType == PositionType.CDS) {
|
if(positionType == PositionType.CDS) {
|
||||||
proteinOffset_from5++;
|
|
||||||
frame = (frame + 1) % 3;
|
frame = (frame + 1) % 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(isWithinExon) {
|
||||||
|
codingCoord_from5++;
|
||||||
|
}
|
||||||
|
|
||||||
} // l for-loop
|
} // l for-loop
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue