From b6989289fc57fea96a872659640673ebd8ce6095 Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 21 Aug 2010 12:09:33 +0000 Subject: [PATCH] Potential bug fix for bad references where some codons may have Ns git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4075 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/annotator/AminoAcid.java | 5 ++ .../walkers/annotator/AminoAcidTable.java | 16 ++--- .../walkers/annotator/TranscriptToInfo.java | 63 ++++++++++--------- 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java index b0f060bc4..54c2ac1ef 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java @@ -82,6 +82,11 @@ public class AminoAcid { return "*".equals(getLetter()); } + /** Returns true if the amino acid is really just a stop codon. */ + public boolean isUnknown() { + return "X".equals(getLetter()); + } + public String toString() { return name; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcidTable.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcidTable.java index d7d395ba0..41541b01c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcidTable.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcidTable.java @@ -34,6 +34,7 @@ import java.util.HashMap; public class AminoAcidTable { + protected static final AminoAcid UNKNOWN = new AminoAcid("X" , "Unknown", "Unk"); protected static final AminoAcid ISOLEUCINE = new AminoAcid("I" , "Isoleucine", "Ile"); protected static final AminoAcid LEUCINE = new AminoAcid("L" , "Leucine", "Leu"); protected static final AminoAcid VALINE = new AminoAcid("V" , "Valine", "Val"); @@ -177,24 +178,17 @@ public class AminoAcidTable { mitochondrialAminoAcidTable.put("TGA", TRYPTOPHAN); } - - - /** * Returns the amino acid encoded by the given codon in a eukaryotic genome. * * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. * - * @return The amino acid matching the given codon. + * @return The amino acid matching the given codon, or the UNKNOWN amino acid if the codon string doesn't match anything */ public static AminoAcid getEukaryoticAA(String codon) { codon = codon.toUpperCase(); final AminoAcid aa = aminoAcidTable.get(codon); - if(aa == null) { - throw new IllegalArgumentException("Invalid codon: " + codon); - } else { - return aa; - } + return aa == null ? UNKNOWN : aa; } @@ -204,13 +198,13 @@ public class AminoAcidTable { * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. * @param isFirstCodon If this is the 1st codon in the gene, then "ATT" encodes Methyonine * - * @return The amino acid matching the given codon in mitochondrial genes. + * @return The amino acid matching the given codon in mitochondrial genes, or the UNKNOWN amino acid if the codon string doesn't match anything */ public static AminoAcid getMitochondrialAA(String codon, boolean isFirstCodon) { codon = codon.toUpperCase(); final AminoAcid aa = mitochondrialAminoAcidTable.get(codon); if(aa == null) { - throw new IllegalArgumentException("Invalid codon: " + codon); + return UNKNOWN; } else if(isFirstCodon && codon.equals("ATT")) { return METHIONINE; //special case - 'ATT' in the first codon of a mitochondrial gene codes for methionine instead of isoleucine } else { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java index 4518b33d0..4f4c20cdf 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java @@ -620,10 +620,10 @@ public class TranscriptToInfo extends RodWalker, TreeMap //check for bad bases if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) || - (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) || - (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) || - (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) || - (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4]))) + (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) || + (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) || + (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) || + (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4]))) { logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedTranscriptRod); ++skippedPositionsCounter; @@ -735,8 +735,8 @@ public class TranscriptToInfo extends RodWalker, TreeMap //check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF) String uORFChangeStr = null; if( (refCodon1.equals("ATG") && !varCodon1.equals("ATG")) || - (refCodon2.equals("ATG") && !varCodon2.equals("ATG")) || - (refCodon3.equals("ATG") && !varCodon3.equals("ATG"))) + (refCodon2.equals("ATG") && !varCodon2.equals("ATG")) || + (refCodon3.equals("ATG") && !varCodon3.equals("ATG"))) { uORFChangeStr = "-1"; } @@ -753,17 +753,22 @@ public class TranscriptToInfo extends RodWalker, TreeMap else if(positionType == PositionType.CDS) { final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) ); - outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5) ); + final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; final AminoAcid refAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( referenceCodon, codonCount_from5 == 1 ) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ; + final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( variantCodon, codonCount_from5 == 1 ) : AminoAcidTable.getEukaryoticAA( variantCodon ) ; + + if ( refAA.isUnknown() || variantAA.isUnknown() ) { + logger.warn("Illegal amino acid detected: refCodon=" + referenceCodon + " altCodon=" + variantCodon); + } + + outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) ); + outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5) ); outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon ); outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode()); final char temp = currentCodon_5to3[frame]; currentCodon_5to3[frame] = haplotypeAlternate; - final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( variantCodon, codonCount_from5 == 1 ) : AminoAcidTable.getEukaryoticAA( variantCodon ) ; outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon ); outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode()); @@ -915,23 +920,23 @@ public class TranscriptToInfo extends RodWalker, TreeMap long key; final char lastChromChar = txChrom.charAt(txChrom.length() -1); switch( Character.toLowerCase(lastChromChar) ) { - case 'm': - case 't': // for hg19 or b36 - key = 0; - break; - case 'x': - key = 23; - break; - case 'y': - key = 24; - break; - default: - if( txChrom.startsWith("chr") ) { - key = Integer.parseInt(txChrom.substring(3)); - } else { - key = Integer.parseInt(txChrom); - } - break; + case 'm': + case 't': // for hg19 or b36 + key = 0; + break; + case 'x': + key = 23; + break; + case 'y': + key = 24; + break; + default: + if( txChrom.startsWith("chr") ) { + key = Integer.parseInt(txChrom.substring(3)); + } else { + key = Integer.parseInt(txChrom); + } + break; } key++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step) @@ -1108,8 +1113,8 @@ public class TranscriptToInfo extends RodWalker, TreeMap //exonFrames = new int[exonCount]; for(int i = 0; i < exonCount; i++) { exonStarts[i] = Integer.parseInt(exonStartStrs[i]); - exonEnds[i] = Integer.parseInt(exonEndStrs[i]); - //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); + exonEnds[i] = Integer.parseInt(exonEndStrs[i]); + //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); } }