Finished implementing

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3273 348d0f76-0448-11de-a6fe-93d51630548a
2010-04-30 14:14:31 +00:00 · 2010-04-30 14:14:31 +00:00 · ba78d146ec
parent 5d5c7f9d34
commit ba78d146ec
1 changed files with 533 additions and 234 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
@ -1,12 +1,16 @@
 package org.broadinstitute.sting.playground.gatk.walkers.annotator;

-import java.io.BufferedOutputStream;
-import java.io.FileOutputStream;
-import java.io.PrintStream;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Date;
-import java.util.LinkedHashMap;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;

 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -19,21 +23,25 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
 import org.broadinstitute.sting.gatk.walkers.Reference;
 import org.broadinstitute.sting.gatk.walkers.Requires;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.gatk.walkers.TreeReducible;
 import org.broadinstitute.sting.gatk.walkers.Window;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.Utils;


 /**
- *  Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
+ *  Takes a UCSC refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
 *  annotations for each possible variant at each transcript position (eg. 4 variants for each position).
+ *
+ *  The map & reduce types are both TreeMap<String, String>
+ *  TreeMap entries represent lines in the output file. The TreeMap key is a combination of a given output line's position (so that this key can be used to sort all output lines
+ *  by reference order), as well as allele and gene name (so that its unique across all output lines). The String value is the output line itself.
 */
@Reference(window=@Window(start=-4,stop=4))
@By(DataSource.REFERENCE)
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="refgene",type=AnnotatorROD.class) })
-public class TranscriptToInfo extends RodWalker<Integer, Integer>
+public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap<String, String>> implements TreeReducible<TreeMap<String, String>>
 {

 /**
@ -57,12 +65,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
 */


-
-    public static final int OUTPUT_STREAM_BUFFER_SIZE = 8*1024*1024;
-
    /** Output columns */
-    public static final String OUTPUT_SORT_KEY = "00000_SORT_KEY";
-
    public static final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN;
    public static final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
    public static final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
@ -89,18 +92,17 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
    public static final String OUTPUT_VARIANT_AA = "variantAA";

    public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
+    public static final String OUTPUT_FUNCTIONAL_CLASS = "functionalClass"; //eg. missense

    public static final String OUTPUT_CODING_COORD_STR = "codingCoordStr";
    public static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr";

-    //public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
    public static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary
    public static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5)



    public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
-        OUTPUT_SORT_KEY,

        OUTPUT_CHRPOS,
        OUTPUT_HAPLOTYPE_REFERENCE,
@ -124,6 +126,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
        OUTPUT_VARIANT_CODON,
        OUTPUT_VARIANT_AA,
        OUTPUT_CHANGES_AMINO_ACID,
+        OUTPUT_FUNCTIONAL_CLASS,

        OUTPUT_CODING_COORD_STR,
        OUTPUT_PROTEIN_COORD_STR,
@ -137,7 +140,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
    /**
     * Container for all data fields from a single refGene.txt row.
     */
-    static class RefGeneTranscriptRecord
+    protected static class RefGeneTranscriptRecord
    {
        public static final String REFGENE_NAME = "name";  //eg. NM_021649
        public static final String REFGENE_STRAND = "strand";  //eg. +
@ -163,6 +166,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
         * Then, all calculations are performed at the end.
         */
        public StringBuilder txSequence;  //the sequence of the entire transcript in order from 5' to 3'
+        public StringBuilder utr5Sequence; //the protein coding sequence (with introns removed) in order from 5' to 3'
        public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3'

        public boolean positiveStrand;
@ -187,14 +191,15 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
            this.rod = refGeneRod;
            //String binStr = rod.get("bin");
            //String idStr = refGeneRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
-
            String strandStr = refGeneRod.get("strand");
-            if(strandStr.equals("+")) {
+            if(strandStr == null) {
+                throw new IllegalArgumentException("refGene record doesn't contain a 'strand' column. Make sure refGene input file has a header and the usual columns: \"" + strandStr + "\"");
+            } else if(strandStr.equals("+")) {
                positiveStrand = true;
            } else if(strandStr.equals("-")) {
                positiveStrand = false;
            } else {
-                throw new IllegalArgumentException("refGene record contains unexpected strand value: \"" + strandStr + "\"");
+                throw new IllegalArgumentException("refGene record contains unexpected value for 'strand' column: \"" + strandStr + "\"");
            }

            name = refGeneRod.get("name");
@ -208,15 +213,18 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
            txStart = loc.getStart();
            txEnd = loc.getStop();

-            txSequence = new StringBuilder( (int) (txEnd - txStart + 1) );  //the sequence of the entire transcript in order from 5' to 3'
-            cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
-
            String cdsStartStr = refGeneRod.get(REFGENE_CDS_START);
            String cdsEndStr = refGeneRod.get(REFGENE_CDS_END);

            cdsStart = Long.parseLong(cdsStartStr);
            cdsEnd = Long.parseLong(cdsEndStr);

+            txSequence = new StringBuilder( (int) (txEnd - txStart + 1) );  //the sequence of the entire transcript in order from 5' to 3'
+            if(isProteinCodingTranscript()) {
+                utr5Sequence = new StringBuilder( positiveStrand ? (int) (cdsStart - txStart + 1) : (int) (txEnd - cdsEnd + 1) ); //TODO reduce init size by size of introns
+                cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
+            }
+
            String exonCountStr = refGeneRod.get(REFGENE_EXON_COUNT);
            String exonStartsStr = refGeneRod.get(REFGENE_EXON_STARTS);
            String exonEndsStr = refGeneRod.get(REFGENE_EXON_ENDS);
@ -380,6 +388,8 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
    }


+    protected String outputFilename = null;
+
    protected int transcriptsProcessedCounter = 0;


@ -388,44 +398,37 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
        intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron
    }

-    /** Output stream for computed results */
-    private PrintStream outputStream;
-

    /**
     * Prepare the output file and the list of available features.
     */
    public void initialize() {
        //init the output file
-        String outputFilename = getToolkit().getArguments().outFileName;
-        if(outputFilename == null ) {
-            outputStream = System.out;
-        } else {
-            try {
-                outputStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFilename), OUTPUT_STREAM_BUFFER_SIZE));
-            } catch(Exception e) {
-                throw new StingException("Unable to open output file for writing: " + outputFilename);
-            }
+        outputFilename = getToolkit().getArguments().outFileName;
+        if(outputFilename == null) {
+            throw new StingException("Output file not specified. (Used -o command-line-arg)" );
+        }
+
+        if(!new File(outputFilename).canWrite()) {
+            throw new StingException("Unable to write to output file: " + outputFilename );
        }

-        outputStream.println(Utils.join(AnnotatorROD.DELIMITER, OUTPUT_COLUMN_NAMES));
    }

    /**
-     * Initialize the number of loci processed to zero.
+     * Initialize the output filename.
     *
     * @return 0
     */
-    public Integer reduceInit() { return 0; }
-
-
-    /**
-     * We want reads that span deletions
-     *
-     * @return true
-     */
-    public boolean includeReadsWithDeletionAtLoci() { return true; }
+    public TreeMap<String, String> reduceInit()
+    {
+        return new TreeMap<String, String>() {

+            public String toString() {
+                return size() + " total entries";
+            }
+        };
+    }


    /**
@ -436,18 +439,19 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
     * @param context  the context for the given locus
     * @return 1 if the locus was successfully processed, 0 if otherwise
     */
-    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    public TreeMap<String, String> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
        if ( tracker == null )
-            return 0;
+            return null;


        final Collection<RODRecordList> rods = tracker.getBoundRodTracks();
        if(rods.size() == 0) {
            //if there's nothing overlapping this locus, skip it.
-            return 0;
+            return null;
        }

-
+        final TreeMap<String, String> result = new TreeMap<String, String>();


        final List<Object> refGeneRODs = (List<Object>) tracker.getReferenceMetaData("refgene");
@ -464,57 +468,105 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
                refGeneRod.setTemporaryAttribute("parsedRefGeneRod", parsedRefGeneRod);
            }

+            //populate the parsedRefGeneRod.positiveStrand data structure
            if(parsedRefGeneRod.positiveStrand) {
                parsedRefGeneRod.txSequence.append(ref.getBase());
            } else {
                parsedRefGeneRod.txSequence.insert(0, ref.getBase());
            }

+            //continue populating the parsedRefGeneRod.utr5Sequence and  parsedRefGeneRod.cdsSequence data structures
            final long position = ref.getLocus().getStart();
-            if(parsedRefGeneRod.isProteinCodingTranscript() && position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd && parsedRefGeneRod.isWithinExon(position) ) {
-                if(parsedRefGeneRod.positiveStrand) {
-                    parsedRefGeneRod.cdsSequence.append(ref.getBase());
-                } else {
-                    parsedRefGeneRod.cdsSequence.insert(0, ref.getBase());
+            if(parsedRefGeneRod.isProteinCodingTranscript() && parsedRefGeneRod.isWithinExon(position) )
+            {
+                //we're within an exon of a proteinCodingTranscript
+
+                if(parsedRefGeneRod.positiveStrand)
+                {
+                    if(position < parsedRefGeneRod.cdsStart)
+                    {
+                        parsedRefGeneRod.utr5Sequence.append(ref.getBase()); //within utr5
+                    }
+                    else if(position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd)
+                    {
+                        parsedRefGeneRod.cdsSequence.append(ref.getBase()); //within CDS
+                    }
+                }
+                else
+                {
+                    if(position > parsedRefGeneRod.cdsEnd)
+                    {
+                        //As we more left-to-right (aka. 3' to 5'), we do insert(0,..) to reverse the sequence so that it become 5' to 3' in parsedRefGeneRod.utr5Sequence.
+                        parsedRefGeneRod.utr5Sequence.insert(0,ref.getBase()); //within utr5.
+                    }
+                    else if(position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd)
+                    {
+                        parsedRefGeneRod.cdsSequence.insert(0,ref.getBase()); //within CDS
+                    }
                }
            }

-            if(position == parsedRefGeneRod.txEnd) {
+            if(position == parsedRefGeneRod.txEnd)
+            {
                //we've reached the end of the transcript - compute all data and write it out.
-                generateOutputRecordsForROD(parsedRefGeneRod);
+                try
+                {
+                    generateOutputRecordsForROD(parsedRefGeneRod, result);
+                }
+                catch(Exception e)
+                {
+                    logger.error(Thread.currentThread().getName() + " - Unexpected error occurred at position: [" + parsedRefGeneRod.txChrom + ":" + position + "] in transcript: " + parsedRefGeneRod, e);
+                }

                transcriptsProcessedCounter++;
                if(transcriptsProcessedCounter % 100 == 0) {
-                    System.err.println(new Date() + ": " +  transcriptsProcessedCounter + " transcripts processed");
+                    logger.info(new Date() + ": " +  transcriptsProcessedCounter + " transcripts processed");
                }
            }
        }

-        return 1;
+        return result;
    }


    private static final char[] ALLELES = {'A','C','G','T'};

+    private static final String OUTPUT_HEADER_LINE;
+
+    static {
+        StringBuilder outputHeaderLine = new StringBuilder();
+        for( final String column : OUTPUT_COLUMN_NAMES ) {
+            if(outputHeaderLine.length() != 0) {
+                outputHeaderLine.append( AnnotatorROD.DELIMITER );
+            }
+            outputHeaderLine.append(column);
+        }
+
+        OUTPUT_HEADER_LINE = outputHeaderLine.toString();
+    }


-    private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod) {
-
+    /**
+     * Returns the filename.
+     *
+     * @param parsedRefGeneRod
+     * @param result
+     * @return
+     */
+    private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod, TreeMap<String, String> result) throws IOException
+    {
        //Transcripts that don't produce proteins are indicated in refGene by cdsStart == cdsEnd
        //These will be handled by generating only one record, with haplotypeAlternate == "*".
        final boolean isProteinCodingTranscript = parsedRefGeneRod.isProteinCodingTranscript();

-        final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //shorten the name
+        final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //alias

        if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
-            System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
+            logger.warn("WARNING: Transcript " + parsedRefGeneRod.name +" at position ["+ parsedRefGeneRod.txChrom + ":" +parsedRefGeneRod.txStart + "-" + parsedRefGeneRod.txEnd + "] has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping...");
            //discard transcripts where CDS length is not a multiple of 3
            return;
        }

-
-        char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
-
        final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
        final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
        final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement
@ -525,67 +577,52 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>

        int frame = 0; //the frame of the current position
        int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
-        int codonCount_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
-        int codingCoord_from5 = isProteinCodingTranscript ? parsedRefGeneRod.computeInitialCodingCoord() : 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
+        int utr5Count_from5 = 0;
+        char[] utr5NucBuffer_5to3 = null; //used to find uORFs - size = 5 because to hold the 3 codons that overlap any given position: [-2,-1,0], [-1,0,1], and [0,1,2]

+        int codonCount_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons
+        int codingCoord_from5 = isProteinCodingTranscript ? parsedRefGeneRod.computeInitialCodingCoord() : -1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
+        boolean codingCoordResetForCDS = false;
+        boolean codingCoordResetForUtr3 = false;
+        final char[] currentCodon_5to3 = isProteinCodingTranscript ? new char[3] : null; //holds the current RNA codon - 5' to 3'

-        boolean mitochondrial = false;
-
-        //compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
-        long chromKey;
-        final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
-        switch( Character.toLowerCase(lastChromChar) ) {
-        case 'm':
-            chromKey = 0;
-            mitochondrial = true;
-            break;
-        case 'x':
-            chromKey = 23;
-            break;
-        case 'y':
-            chromKey = 24;
-            break;
-        default:
-            chromKey = Integer.parseInt(parsedRefGeneRod.txChrom.substring(3));
-            break;
-        }
-
-        chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
-        chromKey *= 100000000000L;
+        final boolean mitochondrial = parsedRefGeneRod.txChrom.toLowerCase().startsWith("chrm");

        PositionType positionType = null;
+        boolean isWithinIntronAndFarFromSpliceJunction = false;
+        long intronStart_5prime = -1;
+        long intronEnd_5prime = -1;
+
+        final Map<String, String> outputLineFields = new HashMap<String, String>();

        for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
        {
-            //get the reference sequence
-            final char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );

-            //whether the current txCoord_5to3 is within an exon
+            //compute certain attributes of the current position
            final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron

            final int distanceToNearestSpliceSite = parsedRefGeneRod.computeDistanceToNearestSpliceSite(txCoord_5to3);
+            final boolean isWithin10bpOfSpliceJunction = Math.abs(distanceToNearestSpliceSite) <= 10;

-            //figure out what region the current position is in.
+
+            //increment coding coord is necessary
+            if(isWithinExon) {
+                codingCoord_from5++;
+            }
+
+            //figure out the current positionType
+            final PositionType prevPositionType = positionType; //save the position before it is updated
            if(isProteinCodingTranscript)
            {
                if(isWithinExon)
                {
-                    //update positionType (and codingCoord_from5 if needed)
-                    if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) {  //multiplying by strandSign is like doing absolute value.
+                    if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) {  //utr5  (multiplying by strandSign is like doing absolute value.)
                        positionType = PositionType.utr5;
-                    } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) {  //multiplying by strandSign is like doing absolute value.
-                        if(positionType != PositionType.utr3) {
-                            //if we're transitioning from CDS to utr3, reset the coding coord to 1.
-                            codingCoord_from5 = 1; //reset the coding coord for utr3
+                    } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) {  //utr3  (multiplying by strandSign is like doing absolute value.)
                        positionType = PositionType.utr3;
-                        }
                    } else {
-                        if(positionType != PositionType.CDS) {
-                            //if we're transitioning from utr5 to CDS, reset the coding coord from -1 to 1.
-                            codingCoord_from5 = 1;
                        positionType = PositionType.CDS;
                    }
-                    }
                } else {
                    positionType = PositionType.intron;
                }
@ -597,15 +634,119 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
                }
            }

-            //compute current codon
-            if(positionType == PositionType.CDS && frame == 0)
-            {
-                if(currentCodon_5to3 == null) {
-                    currentCodon_5to3 = new char[3];
+            //handle transitions
+            if(positionType == PositionType.CDS && prevPositionType != PositionType.CDS && !codingCoordResetForCDS) {
+                //transitioning from utr5 to CDS, reset the coding coord from -1 to 1.
+                codingCoord_from5 = 1;
+                codingCoordResetForCDS = true;
+            } else if(positionType == PositionType.utr3 && prevPositionType != PositionType.utr3 && !codingCoordResetForUtr3) {
+                //transitioning from CDS to utr3, reset the coding coord to 1.
+                codingCoord_from5 = 1;
+                codingCoordResetForUtr3 = true;
            }

-                codonCount_from5++;

+            try
+            {
+
+                //handle introns
+                boolean wasWithinIntronAndFarFromSpliceJunction = isWithinIntronAndFarFromSpliceJunction;
+                isWithinIntronAndFarFromSpliceJunction = !isWithinExon && !isWithin10bpOfSpliceJunction;
+
+                if(!wasWithinIntronAndFarFromSpliceJunction && isWithinIntronAndFarFromSpliceJunction) {
+                    //save intron start
+                    intronStart_5prime = txCoord_5to3;
+
+                } else if(wasWithinIntronAndFarFromSpliceJunction && !isWithinIntronAndFarFromSpliceJunction) {
+                    //output intron record
+                    intronEnd_5prime = txCoord_5to3 - increment_5to3;
+
+                    final long intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ;
+                    final long intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime);
+                    outputLineFields.clear();
+                    outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + intronStart +  "-" + intronEnd);
+                    outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) );
+                    outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, Character.toString( '*' ) );
+                    outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
+                    outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
+                    outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
+
+                    outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
+
+                    if(isProteinCodingTranscript)
+                    {
+                        outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
+                    }
+
+                    final String sortKey = computeSortKey(parsedRefGeneRod.txChrom, intronStart, '*', parsedRefGeneRod);
+
+                    addThisLineToResult(sortKey, outputLineFields, result);
+                }
+
+
+                //when in utr5, compute the utr5NucBuffer_5to3 which is later used to compute the OUTPUT_UORF_CHANGE field
+                if(positionType == PositionType.utr5)
+                {
+                    if(utr5Count_from5 < parsedRefGeneRod.utr5Sequence.length())
+                    {
+                        if(utr5NucBuffer_5to3 == null) {
+                            //initialize
+                            utr5NucBuffer_5to3 = new char[5];
+                            utr5NucBuffer_5to3[3] = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 );
+                            if(!positiveStrand) {
+                                utr5NucBuffer_5to3[3] = BaseUtils.simpleComplement(utr5NucBuffer_5to3[3]);
+                            }
+
+                            if(utr5Count_from5 + 1 < parsedRefGeneRod.utr5Sequence.length() ) {
+                                utr5NucBuffer_5to3[4] = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 + 1 );
+                                if(!positiveStrand) {
+                                    utr5NucBuffer_5to3[4] = BaseUtils.simpleComplement(utr5NucBuffer_5to3[4]);
+                                }
+                            }
+                        }
+
+                        //as we move 5' to 3', shift nucleotides down to the 5' end, making room for the new 3' nucleotide:
+                        utr5NucBuffer_5to3[0] = utr5NucBuffer_5to3[1];
+                        utr5NucBuffer_5to3[1] = utr5NucBuffer_5to3[2];
+                        utr5NucBuffer_5to3[2] = utr5NucBuffer_5to3[3];
+                        utr5NucBuffer_5to3[3] = utr5NucBuffer_5to3[4];
+
+                        char nextRefBase = 0;
+                        if( utr5Count_from5 + 2 < parsedRefGeneRod.utr5Sequence.length() )
+                        {
+                            nextRefBase = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 + 2 );
+
+                            //reverse complement if necessary
+                            if(!positiveStrand) {
+                                nextRefBase = BaseUtils.simpleComplement(nextRefBase);
+                            }
+                        }
+                        utr5NucBuffer_5to3[4] = nextRefBase;
+
+                        //check for bad bases
+                        if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) ||
+                            (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) ||
+                            (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) ||
+                            (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) ||
+                            (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4])))
+                        {
+                            logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedRefGeneRod);
+                            continue;
+                        }
+
+                    } else { // if(utr5Count_from5 >= parsedRefGeneRod.utr5Sequence.length())
+                        //defensive programming
+                        logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". utr5Count_from5 is now " + utr5Count_from5 + ", while parsedRefGeneRod.utr5Sequence.length() == " + parsedRefGeneRod.utr5Sequence.length() + ". This means parsedRefGeneRod.utr5Sequence isn't as long as it should be. This is a bug in handling this record: " + parsedRefGeneRod);
+                        continue;
+                    }
+                }
+
+
+                //when in CDS, compute current codon
+                if(positionType == PositionType.CDS)
+                {
+                    if(frame == 0)
+                    {
                        currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords
                        currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 );
                        currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 + 1);
@ -617,15 +758,34 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
                        }
                    }

-            for(final char haplotypeAlternate : ALLELES )
+                    //check for bad bases
+                    if(!BaseUtils.isRegularBase(currentCodon_5to3[0]) || !BaseUtils.isRegularBase(currentCodon_5to3[1]) || !BaseUtils.isRegularBase(currentCodon_5to3[2])) {
+                        logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". CDS codon contains irregular base:" + currentCodon_5to3[0] + currentCodon_5to3[1] + currentCodon_5to3[2]);// +". Transcript is: " + parsedRefGeneRod);
+                        continue;
+                    }
+
+                }
+
+                char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );
+                if(!BaseUtils.isRegularBase(haplotypeReference)) {
+                    //check for bad bases
+                    logger.error("Exception: Current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name + " contains an irregular base:" + haplotypeReference); // +". Transcript is: " + parsedRefGeneRod);
+                    continue;
+                }
+
+                for(char haplotypeAlternate : ALLELES )
                {
-                final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
+                    outputLineFields.clear();

-                outputLineFields.put(OUTPUT_SORT_KEY,  Long.toString( chromKey + txCoord_5to3) );
+                    if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) {
+                        haplotypeReference = '*';
+                        haplotypeAlternate = '*';
+                    }

+                    //compute simple OUTPUT fields.
                    outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + txCoord_5to3);
                    outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( haplotypeReference ) );
-                outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, isProteinCodingTranscript ? Character.toString( haplotypeAlternate ) : "*");
+                    outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, Character.toString( haplotypeAlternate ) );
                    outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
                    outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
                    outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
@ -634,9 +794,9 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
                    outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
                    outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) );

-                //comppute OUTPUT_SPLICE_INFO
+                    //compute OUTPUT_SPLICE_INFO
                    final String spliceInfoString;
-                if(Math.abs(distanceToNearestSpliceSite) <= 10) {
+                    if(isWithin10bpOfSpliceJunction) {
                        if(distanceToNearestSpliceSite < 0) {
                            //is on the 5' side of the splice junction
                            if(isWithinExon) {
@ -654,19 +814,42 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
                        outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString);
                    }

+                    //compute OUTPUT_IN_CODING_REGION
                    if(isProteinCodingTranscript)
                    {
                        outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
+                    }

-                    if(isWithinExon)
-                    {
+
+                    //compute OUTPUT_UORF_CHANGE
                    if(positionType == PositionType.utr5)
                    {
-                            /*
-                            if(THIS IS A uORF)
-                            outputLineFields.put(OUTPUT_UORF_CHANGE, "TODO" ); //TODO put this here
-                            */
+                        String refCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + utr5NucBuffer_5to3[2]).toLowerCase();
+                        String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toLowerCase();
+                        String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toLowerCase();
+
+                        String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternate).toLowerCase();
+                        String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternate) + utr5NucBuffer_5to3[3]).toLowerCase();
+                        String varCodon3 = (Character.toString(haplotypeAlternate) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toLowerCase();
+
+                        //check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF)
+                        String uORFChangeStr = null;
+                        if( (refCodon1.equals("atg") && !varCodon1.equals("atg")) ||
+                            (refCodon2.equals("atg") && !varCodon2.equals("atg")) ||
+                            (refCodon3.equals("atg") && !varCodon3.equals("atg")))
+                        {
+                            uORFChangeStr = "-1";
                        }
+                        else if((varCodon1.equals("atg") && !refCodon1.equals("atg")) ||
+                                (varCodon2.equals("atg") && !refCodon2.equals("atg")) ||
+                                (varCodon3.equals("atg") && !refCodon3.equals("atg")))
+                        {
+                            uORFChangeStr = "+1";
+                        }
+
+                        outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr );
+                    }
+                    //compute CDS-specific fields
                    else if(positionType == PositionType.CDS)
                    {
                        final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
@ -688,11 +871,26 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>

                        currentCodon_5to3[frame] = temp;

-                            outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.getLetter().equals(variantAA.getLetter())));
+                        boolean changesAA = !refAA.equals(variantAA);
+                        outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA));
+                        final String functionalClass;
+                        if(changesAA) {
+                            if(variantAA.isStop()) {
+                                functionalClass ="nonsense";
+                            } else {
+                                functionalClass ="missense";
                            }
+                        } else {
+                            functionalClass ="silent";
+                        }
+                        outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass);
                    }


+                    //compute OUTPUT_CODING_COORD_STR
+                    if(isProteinCodingTranscript)
+                    {
+                        //compute coding coord
                        final StringBuilder codingCoordStr = new StringBuilder();
                        codingCoordStr.append( mitochondrial ? "m." : "c." );
                        if(positionType == PositionType.utr3) {
@ -701,79 +899,180 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>

                        if(isWithinExon) {
                            codingCoordStr.append( Integer.toString(codingCoord_from5) );
+                            codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
                        } else {
                            //intronic coordinates
                            if(distanceToNearestSpliceSite < 0) {
                                codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) );
                            } else {
                                codingCoordStr.append( Integer.toString(codingCoord_from5 ) );
+                                codingCoordStr.append( "+" );
                            }
+
                            codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) );
                        }
-                    codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
-                    outputLineFields.put(OUTPUT_CODING_COORD_STR,  codingCoordStr.toString() );

+                        outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString());
                    }


-                //write out the record
-                final StringBuilder outputLine = new StringBuilder();
-                for( final String column : OUTPUT_COLUMN_NAMES ) {
-                    if(outputLine.length() != 0) {
-                        outputLine.append( AnnotatorROD.DELIMITER );
-                    }
-                    final String value = outputLineFields.get(column);
-                    outputLine.append(value != null ? value : "");
+                    //generate the output line and add it to 'result' map.
+                    if(!isWithinIntronAndFarFromSpliceJunction) {
+                        final String sortKey = computeSortKey(parsedRefGeneRod.txChrom, txCoord_5to3, haplotypeAlternate, parsedRefGeneRod);
+
+                        addThisLineToResult(sortKey, outputLineFields, result);
                    }

-                outputStream.println(outputLine);
-
-                if( !isProteinCodingTranscript ) {
+                    if( haplotypeAlternate == '*' ) {
                        //need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles
                        break;
                    }

                } //ALLELE for-loop
-
-
+            }
+            finally
+            {
+                //increment coords
                txOffset_from5++;

-            if(positionType == PositionType.CDS) {
+                if(positionType == PositionType.utr5) {
+                    utr5Count_from5++;
+                } else if(positionType == PositionType.CDS) {
                    frame = (frame + 1) % 3;
+                    if(frame == 0) {
+                        codonCount_from5++;
+                    }
                }
-
-            if(isWithinExon) {
-                codingCoord_from5++;
            }
-
        } // l for-loop

    } //method close


    /**
-     * Increment the number of loci processed.
+     * Utility method. Creates a line containing the outputLineFields, and adds it to result, hashed by the sortKey.
     *
-     * @param value result of the map.
-     * @param sum   accumulator for the reduce.
-     * @return the new number of loci processed.
+     * @param sortKey  Reference-order sort key
+     * @param outputLineFields Column-name to value pairs.
+     * @param result  The hashmap that accumulates all results.
     */
-    public Integer reduce(Integer value, Integer sum) {
-        return sum + value;
+    private void addThisLineToResult(final String sortKey, final Map<String, String> outputLineFields, TreeMap<String, String> result) {
+        final StringBuilder outputLine = new StringBuilder();
+        for( final String column : OUTPUT_COLUMN_NAMES ) {
+            if(outputLine.length() != 0) {
+                outputLine.append( AnnotatorROD.DELIMITER );
+            }
+            final String value = outputLineFields.get(column);
+            if(value != null) {
+                outputLine.append(value);
+            }
+        }
+
+        result.put(sortKey, outputLine.toString());
+    }
+
+
+
+    /**
+     * Merge the "value" file with the "sum" file and return the file name
+     * of the merged data set.
+     */
+    @Override
+    public TreeMap<String, String> reduce(TreeMap<String, String> value, TreeMap<String, String> sum)
+    {
+        if(value != null) {
+            sum.putAll(value);
+        }
+
+        return sum;
+    }
+
+    @Override
+    public TreeMap<String, String> treeReduce(TreeMap<String, String> lhs, TreeMap<String, String> rhs)
+    {
+        lhs.putAll(rhs);
+        return lhs;
+    }
+
+
+
+    /**
+     * Computes a reference-order sort key for a record with the given
+     * UCSC chromosome name, position, and allele at that position.
+     *
+     * @param txChrom
+     * @param position
+     * @param allele
+     * @param parsedRefGeneRod
+     * @return
+     */
+    protected String computeSortKey(String txChrom, long position, char allele, RefGeneTranscriptRecord parsedRefGeneRod)
+    {
+        //compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
+        long key;
+        final char lastChromChar = txChrom.charAt(txChrom.length() -1);
+        switch( Character.toLowerCase(lastChromChar) ) {
+        case 'm':
+            key = 0;
+            break;
+        case 'x':
+            key = 23;
+            break;
+        case 'y':
+            key = 24;
+            break;
+        default:
+            key = Integer.parseInt(txChrom.substring(3));
+            break;
+        }
+
+        key++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
+        key += 100; //add a leading 1 to avoid having to create leading zeros
+        key *= 10*1000L*1000L*1000L*1000L; //this way position doesn't overlap with chromosome keys
+        key += position*1000L;
+        key +=   allele;
+
+        String result = Long.toString(key);
+        if(parsedRefGeneRod != null) {
+            result += (parsedRefGeneRod.name + parsedRefGeneRod.name2);
+        }
+        return result;
    }

    /**
-     * Tell the user the number of loci processed and close out the new variants file.
+     * Moves the file to the destination directory.
     *
-     * @param result  the number of loci seen.
+     * @param result The file of the result
     */
-    public void onTraversalDone(Integer result) {
-        out.printf("Processed %d loci.\n", result);
-
-        outputStream.close();
+    public void onTraversalDone(TreeMap<String, String> result)
+    {
+        if(result == null) {
+            logger.info("Result is: null. No output file was generated.\n");
+            return;
        }

+        //move the fully merged temp file to the output file.
+        logger.info("Result - writing " + result.size() + " lines to: " + outputFilename);
+        BufferedWriter fileWriter  = null;
+        try {
+            fileWriter = new BufferedWriter(new FileWriter(outputFilename));
+            fileWriter.write(OUTPUT_HEADER_LINE + '\n');
+            for(String value : result.values() ) {
+                fileWriter.write(value + "\n");
+            }
+        } catch(IOException e) {
+            logger.info("Unable to write to file: " + outputFilename);
+        } finally {
+            try {
+                if (fileWriter != null)
+                    fileWriter.close();
+            } catch(IOException e) {
+                logger.info("Unable to write to file: " + outputFilename);
+            }
+        }

+        logger.info("Deleting temp files..");
+    }

 }