Change specification of AnnotationInputTable, and fix 2 bugs.

Previous output spec contained 3 columns:
 haplotypeReference,haplotypeAlternate,haplotypeStrand
where haplotypeReference was always on the + strand, and haplotypeAlternate was on the strand specified by haplotypeStrand.

The new specification contains 3 columns:
 haplotypeReference,haplotypeAlternate,transcriptStrand
where haplotypeRef and haplotypeAlt are required to be on the + strand.  transcriptStrand now specifies the strand of the transcript, which is needed for interpreting the haplotypes.

Bugfix #1: fix incorrect assignment of variantCodon and variantAA
(Previously variantCodon was incorrectly set to referenceCodon)

Bugfix #2: fix incorrect codingCoordStr values for - strands (bug reported by Giulio Genovese), and incorrect usage of "m." for mitochondrial transcripts (bug reported by Steve Hershman)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4444 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
scalvo 2010-10-06 20:46:09 +00:00
parent b5c127e643
commit bda427f078
1 changed files with 36 additions and 30 deletions

View File

@ -84,9 +84,9 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
GenomicAnnotation.START_COLUMN,
GenomicAnnotation.END_COLUMN,
GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN,
GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN,
GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN };
GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN};
private static final String OUTPUT_TRANSCRIPT_STRAND = "transcriptStrand"; //rg. +/-
private static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true
private static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2
private static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic
@ -104,6 +104,7 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
private static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary
private static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5)
private static final String[] TRANSCRIPT_COLUMNS = {
OUTPUT_TRANSCRIPT_STRAND,
OUTPUT_POSITION_TYPE,
OUTPUT_FRAME,
OUTPUT_MRNA_COORD,
@ -466,12 +467,12 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(intronEnd));
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) );
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) );
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN, positiveStrand ? "+" : "-");
for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) {
outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] );
}
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" );
if ( isProteinCodingTranscript )
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
@ -550,6 +551,9 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
if(!positiveStrand) {
haplotypeReference = BaseUtils.simpleComplement(haplotypeReference); //txSequence contents depend on whether its +/- strand
}
char haplotypeReferenceStrandSpecific= positiveStrand ? haplotypeReference : BaseUtils.simpleComplement(haplotypeReference);
if(!BaseUtils.isRegularBase(haplotypeReference) && haplotypeReference != '*') { //* is special case for mitochondrial genes where polyA tail completes the last codon
//check for bad bases
@ -559,8 +563,10 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
}
char haplotypeAlternateStrandSpecific;
for(char haplotypeAlternate : ALLELES )
{
haplotypeAlternateStrandSpecific= positiveStrand ? haplotypeAlternate : BaseUtils.simpleComplement(haplotypeAlternate);
outputLineFields.clear();
if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) {
@ -574,12 +580,12 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(txCoord_5to3));
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( haplotypeReference ) );
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN, Character.toString( haplotypeAlternate ) );
outputLineFields.put(GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN, positiveStrand ? "+" : "-");
for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) {
outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] );
}
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" );
if(isWithinExon) {
outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(mrnaCoord_from5) );
}
@ -619,9 +625,9 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toUpperCase();
String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase();
String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternate).toUpperCase();
String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternate) + utr5NucBuffer_5to3[3]).toUpperCase();
String varCodon3 = (Character.toString(haplotypeAlternate) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase();
String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternateStrandSpecific).toUpperCase();
String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternateStrandSpecific) + utr5NucBuffer_5to3[3]).toUpperCase();
String varCodon3 = (Character.toString(haplotypeAlternateStrandSpecific) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase();
//check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF)
String uORFChangeStr = null;
@ -641,61 +647,61 @@ public class TranscriptToGenomicInfo extends RodWalker<Integer, Integer> {
outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr );
}
//compute CDS-specific fields
else if(positionType == PositionType.CDS)
{
else if (positionType == PositionType.CDS) {
final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
final char temp = currentCodon_5to3[frame];
currentCodon_5to3[frame] = haplotypeAlternateStrandSpecific;
final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
currentCodon_5to3[frame] = temp;
final AminoAcid refAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( referenceCodon, codonCount_from5 == 1 ) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ;
final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( variantCodon, codonCount_from5 == 1 ) : AminoAcidTable.getEukaryoticAA( variantCodon ) ;
final AminoAcid refAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(referenceCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ;
final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(variantCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( variantCodon ) ;
if ( refAA.isUnknown() || variantAA.isUnknown() ) {
if (refAA.isUnknown() || variantAA.isUnknown()) {
logger.warn("Illegal amino acid detected: refCodon=" + referenceCodon + " altCodon=" + variantCodon);
}
outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) );
outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5) );
outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon );
outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" );
outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame));
outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5));
outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon);
outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode());
final char temp = currentCodon_5to3[frame];
currentCodon_5to3[frame] = haplotypeAlternate;
outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon );
outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon);
outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode());
outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter() ); //for example: "p.K76A"
currentCodon_5to3[frame] = temp;
outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p." + refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter()); //for example: "p.K7$
boolean changesAA = !refAA.equals(variantAA);
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA));
final String functionalClass;
if(changesAA) {
if(variantAA.isStop()) {
functionalClass ="nonsense";
if (changesAA) {
if (variantAA.isStop()) {
functionalClass = "nonsense";
} else if (refAA.isStop()) {
functionalClass = "readthrough";
} else {
functionalClass ="missense";
functionalClass = "missense";
}
} else {
functionalClass ="silent";
functionalClass = "silent";
}
outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass);
}
//compute OUTPUT_CODING_COORD_STR
if(isProteinCodingTranscript)
{
//compute coding coord
final StringBuilder codingCoordStr = new StringBuilder();
codingCoordStr.append( isMitochondrialTranscript ? "m." : "c." );
codingCoordStr.append( "c." );
if(positionType == PositionType.utr3) {
codingCoordStr.append( '*' );
}
if(isWithinExon) {
codingCoordStr.append( Integer.toString(codingCoord_from5) );
codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
codingCoordStr.append ( haplotypeReferenceStrandSpecific + ">" + haplotypeAlternateStrandSpecific);
} else {
//intronic coordinates
if(distanceToNearestSpliceSite < 0) {