From e643a9e7a5b12910f5d092f4a3c497c7d2ea0531 Mon Sep 17 00:00:00 2001
From: weisburd <weisburd@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Thu, 22 Apr 2010 12:11:19 +0000
Subject: [PATCH] Takes a refGene table ( -B arg must be: -B
 refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big
 table of nucleotides containing annotations for each possible variant at each
 transcript position (eg. 4 variants for each position).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3238 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../walkers/annotator/TranscriptToInfo.java   | 642 ++++++++++++++++++
 1 file changed, 642 insertions(+)
 create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java

diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
new file mode 100755
index 000000000..d3c8706f8
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java
@@ -0,0 +1,642 @@
+package org.broadinstitute.sting.playground.gatk.walkers.annotator;
+
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+import java.util.Collection;
+import java.util.Date;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
+import org.broadinstitute.sting.gatk.walkers.By;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.RMD;
+import org.broadinstitute.sting.gatk.walkers.Reference;
+import org.broadinstitute.sting.gatk.walkers.Requires;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.gatk.walkers.Window;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.Utils;
+
+
+/**
+ *  Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
+ *  annotations for each possible variant at each transcript position (eg. 4 variants for each position).
+ */
+@Reference(window=@Window(start=-4,stop=4))
+@By(DataSource.REFERENCE)
+@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="refgene",type=AnnotatorROD.class) })
+public class TranscriptToInfo extends RodWalker<Integer, Integer>
+{
+
+/**
+   RefGene column names:
+    # bin           182                     smallint(5) unsigned    range    Indexing field to speed chromosome range queries.
+    # name          NM_021649               varchar(255)            values   Name of gene (usually transcript_id from GTF)
+    # chrom         chr5                    varchar(255)            values   Reference sequence chromosome or scaffold
+    # strand        -                       char(1)                 values   + or - for strand
+    # txStart       114942238               int(10) unsigned        range    Transcription start position
+    # txEnd         114966008               int(10) unsigned        range    Transcription end position
+    # cdsStart      114944144               int(10) unsigned        range    Coding region start
+    # cdsEnd        114944852               int(10) unsigned        range    Coding region end
+    # exonCount     2                       int(10) unsigned        range    Number of exons
+    # exonStarts    114942238,114965692,    longblob                         Exon start positions
+    # exonEnds      114944911,114966008,    longblob                         Exon end positions
+    # id            0                       int(10) unsigned        range    Unique identifier
+    # name2         TICAM2                  varchar(255)            values   Alternate name (e.g. gene_id from GTF)
+    # cdsStartStat  cmpl                    enum('none','unk','incmpl','cmpl')    values    enum('none','unk','incmpl','cmpl')
+    # cdsEndStat    cmpl                    enum('none','unk','incmpl','cmpl')    values    enum('none','unk','incmpl','cmpl')
+    # exonFrames    0,-1,                   longblob                         Exon frame {0,1,2}, or -1 if no frame for exon
+*/
+
+
+
+    public static final int OUTPUT_STREAM_BUFFER_SIZE = 8*1024*1024;
+
+    /** Output columns */
+    public static final String OUTPUT_SORT_KEY = "00000_SORT_KEY";
+
+    public static final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN;
+    public static final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
+    public static final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
+    public static final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN;
+
+    public static final String OUTPUT_GENE_NAME = "geneName"; //eg. NDUFS2
+    public static final String OUTPUT_TRANSCRIPT_ACCESSION = "transcriptId"; //eg. NM_001212414.4
+
+    //public static final String OUTPUT_IN_TRANSCRIPT = "inTranscript"; //eg. true
+    public static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true
+
+    public static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2
+    public static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic
+
+    public static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript
+    public static final String OUTPUT_CODING_COORD = "codingCoord"; //1-based offset within the cds region
+
+    public static final String OUTPUT_SPLICE_DISTANCE = "minSpliceDistance"; //eg. 40
+
+    public static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20
+    public static final String OUTPUT_REFERENCE_CODON = "referenceCodon";
+    public static final String OUTPUT_REFERENCE_AA = "referenceAA";
+    public static final String OUTPUT_VARIANT_CODON = "variantCodon";
+    public static final String OUTPUT_VARIANT_AA = "variantAA";
+
+    public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
+
+    public static final String OUTPUT_CODING_COORD_STR = "coding_coord_str";
+    public static final String OUTPUT_PROTEIN_COORD_STR = "protein_coord_str";
+    public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
+
+    public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
+        OUTPUT_SORT_KEY,
+
+        OUTPUT_CHRPOS,
+        OUTPUT_HAPLOTYPE_REFERENCE,
+        OUTPUT_HAPLOTYPE_ALTERNATE,
+        OUTPUT_HAPLOTYPE_STRAND,
+
+        OUTPUT_GENE_NAME,
+        OUTPUT_TRANSCRIPT_ACCESSION,
+
+        OUTPUT_POSITION_TYPE,
+
+
+        OUTPUT_FRAME,
+        OUTPUT_MRNA_COORD,
+        OUTPUT_CODING_COORD,
+        OUTPUT_CODON_NUMBER,
+        OUTPUT_SPLICE_DISTANCE,
+
+        OUTPUT_REFERENCE_CODON,
+        OUTPUT_REFERENCE_AA,
+        OUTPUT_VARIANT_CODON,
+        OUTPUT_VARIANT_AA,
+        OUTPUT_CHANGES_AMINO_ACID,
+
+        OUTPUT_CODING_COORD_STR,
+        OUTPUT_PROTEIN_COORD_STR,
+        OUTPUT_INTRON_COORD_STR,
+
+        OUTPUT_IN_CODING_REGION,
+    };
+
+
+    /**
+     * Container for all data fields from a single refGene.txt row.
+     */
+    static class RefGeneTranscriptRecord
+    {
+        public static final String REFGENE_NAME = "name";  //eg. NM_021649
+        public static final String REFGENE_STRAND = "strand";  //eg. +
+        public static final String REFGENE_TXSTART = "txStart";
+        public static final String REFGENE_TXEND = "txEnd";
+        public static final String REFGENE_CDS_START = "cdsStart";
+        public static final String REFGENE_CDS_END = "cdsEnd";
+        public static final String REFGENE_EXON_COUNT = "exonCount";
+        public static final String REFGENE_EXON_STARTS = "exonStarts";
+        public static final String REFGENE_EXON_ENDS = "exonEnds";
+        public static final String REFGENE_EXON_FRAMES = "exonFrames";
+        public static final String REFGENE_NAME2 = "name2"; //eg. TICAM2
+
+
+        /**
+         * This StringBuffer accumulates the entire transcript sequence.
+         * This buffer is used instead of using the GATK window mechanism
+         * because arbitrary-length look-aheads and look-behinds are needed to deal
+         * with codons that span splice-junctions. The window mechanism
+         * requires hard-coding the window size, which would translate into a
+         * limit on maximum supported intron size. To avoid this, the
+         * sequence is accumulated as the transcript is scanned left-to-right.
+         * Then, all calculations are performed at the end.
+         */
+        public StringBuilder txSequence;  //the sequence of the entire transcript in order from 5' to 3'
+        public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3'
+
+        public boolean positiveStrand;
+        public String name; //eg. NM_021649
+        public String name2; //eg. TICAM2
+
+        public String txChrom; //The chromosome
+        public long txStart;
+        public long txEnd;
+
+        public long cdsStart;
+        public long cdsEnd;
+
+        public long[] exonStarts;
+        public long[] exonEnds;
+        public int[] exonFrames;
+
+        private AnnotatorROD rod;
+
+
+        public RefGeneTranscriptRecord(final AnnotatorROD refGeneRod) {
+            this.rod = refGeneRod;
+            //String binStr = rod.get("bin");
+            //String idStr = refGeneRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
+
+            String strandStr = refGeneRod.get("strand");
+            if(strandStr.equals("+")) {
+                positiveStrand = true;
+            } else if(strandStr.equals("-")) {
+                positiveStrand = false;
+            } else {
+                throw new IllegalArgumentException("refGene record contains unexpected strand value: \"" + strandStr + "\"");
+            }
+
+            name = refGeneRod.get("name");
+            name2 = refGeneRod.get("name2");
+
+            //String txStartStr = refGeneRod.get(REFGENE_TXSTART);  //These fields are used to generate column 1 of the ROD file (eg. they get turned into chr:txStart-txStop)
+            //String txEndStr = refGeneRod.get(REFGENE_TXEND);
+
+            final GenomeLoc loc = refGeneRod.getLocation();
+            txChrom = loc.getContig();
+            txStart = loc.getStart();
+            txEnd = loc.getStop();
+
+            txSequence = new StringBuilder( (int) (txEnd - txStart + 1) );  //the sequence of the entire transcript in order from 5' to 3'
+            cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
+
+            String cdsStartStr = refGeneRod.get(REFGENE_CDS_START);
+            String cdsEndStr = refGeneRod.get(REFGENE_CDS_END);
+
+            cdsStart = Long.parseLong(cdsStartStr);
+            cdsEnd = Long.parseLong(cdsEndStr);
+
+            String exonCountStr = refGeneRod.get(REFGENE_EXON_COUNT);
+            String exonStartsStr = refGeneRod.get(REFGENE_EXON_STARTS);
+            String exonEndsStr = refGeneRod.get(REFGENE_EXON_ENDS);
+            String exonFramesStr = refGeneRod.get(REFGENE_EXON_FRAMES);
+
+            String[] exonStartStrs = exonStartsStr.split(",");
+            String[] exonEndStrs = exonEndsStr.split(",");
+            String[] exonFrameStrs = exonFramesStr.split(",");
+
+            int exonCount = Integer.parseInt(exonCountStr);
+            if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length || exonCount != exonFrameStrs.length)
+            {
+                throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + ",  Exon frames: " + exonFramesStr + ", Exon count: " + exonCountStr +". refGeneRod = " + refGeneRod);
+            }
+
+            exonStarts = new long[exonCount];
+            exonEnds = new long[exonCount];
+            exonFrames = new int[exonCount];
+            for(int i = 0; i < exonCount; i++) {
+                exonStarts[i] = Long.parseLong(exonStartStrs[i]);
+                  exonEnds[i] = Long.parseLong(exonEndStrs[i]);
+                exonFrames[i] = Integer.parseInt(exonFrameStrs[i]);
+            }
+        }
+
+
+        /**
+         * Takes a genomic position on the same contig as the transcript, and
+         * returns true if this position falls within an exon.
+         */
+        public boolean isWithinExon(final long genomPosition) {
+            for(int i = 0; i < exonStarts.length; i++) {
+                final long curStart = exonStarts[i];
+                if(genomPosition < curStart) {
+                    return false;
+                }
+                final long curStop = exonEnds[i];
+                if(genomPosition <= curStop) {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        /**
+         * Computes the distance to the nearest splice-site.
+         * Returns 1 when down-stream of splice-juction, -1 when upstream
+         */
+        public int distanceToSpliceSite(final long genomPosition) {
+            int prevDistance = Integer.MAX_VALUE;
+            int curDistance;
+            for(int i = 0; i < exonStarts.length; i++) {
+                final long curStart = exonStarts[i];
+                curDistance = (int) (curStart-genomPosition);
+                if(genomPosition < curStart) {
+                    return Math.min(prevDistance, curDistance);
+                } else {
+                    prevDistance = (int) (genomPosition - curStart) + 1;
+                }
+
+                final long curStop = exonEnds[i];
+                curDistance = (int) (curStop-genomPosition) + 1;
+                if(genomPosition <= curStop) {
+                    return Math.min(prevDistance, curDistance);
+                } else {
+                    prevDistance = (int) (genomPosition - curStop);
+                }
+            }
+
+            return prevDistance; //out of exons. return genomPosition-curStop
+        }
+
+
+        /**
+         * Returns true if this is a coding transcript (eg. is translated
+         * into proteins). Returns false for non-coding RNA.
+         */
+        public boolean isProteinCodingTranscript() {
+            return cdsStart < cdsEnd;
+        }
+
+        @Override
+        public String toString() {
+            return rod.toString();
+        }
+
+    }
+
+
+    protected int transcriptsProcessedCounter = 0;
+
+
+    /** Possible values for the "POSITION_TYPE" output column. */
+    enum PositionType {
+        intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron
+    }
+
+    /** Output stream for computed results */
+    private PrintStream outputStream;
+
+
+    /**
+     * Prepare the output file and the list of available features.
+     */
+    public void initialize() {
+        //init the output file
+        String outputFilename = getToolkit().getArguments().outFileName;
+        if(outputFilename == null ) {
+            outputStream = System.out;
+        } else {
+            try {
+                outputStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFilename), OUTPUT_STREAM_BUFFER_SIZE));
+            } catch(Exception e) {
+                throw new StingException("Unable to open output file for writing: " + outputFilename);
+            }
+        }
+
+        outputStream.println(Utils.join(AnnotatorROD.DELIMITER, OUTPUT_COLUMN_NAMES));
+    }
+
+    /**
+     * Initialize the number of loci processed to zero.
+     *
+     * @return 0
+     */
+    public Integer reduceInit() { return 0; }
+
+
+    /**
+     * We want reads that span deletions
+     *
+     * @return true
+     */
+    public boolean includeReadsWithDeletionAtLoci() { return true; }
+
+
+
+    /**
+     * For each site of interest, generate the appropriate fields.
+     *
+     * @param tracker  the meta-data tracker
+     * @param ref      the reference base
+     * @param context  the context for the given locus
+     * @return 1 if the locus was successfully processed, 0 if otherwise
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if ( tracker == null )
+            return 0;
+
+
+        Collection<RODRecordList> rods = tracker.getBoundRodTracks();
+        if(rods.size() == 0) {
+            //if there's nothing overlapping this locus, skip it.
+            return 0;
+        }
+
+
+
+
+        final List<Object> refGeneRODs = (List<Object>) tracker.getReferenceMetaData("refgene");
+
+        for(Object refGeneRodObject : refGeneRODs) {
+            if(! (refGeneRodObject instanceof AnnotatorROD) ) {
+                throw new RuntimeException("The 'refgene' -B arg must have the type: 'AnnotatorInputTable'.");
+            }
+
+            AnnotatorROD refGeneRod = (AnnotatorROD) refGeneRodObject;
+            RefGeneTranscriptRecord parsedRefGeneRod = (RefGeneTranscriptRecord) refGeneRod.getTemporaryAttribute("parsedRefGeneRod");
+            if( parsedRefGeneRod == null ) {
+                parsedRefGeneRod = new RefGeneTranscriptRecord(refGeneRod);
+                refGeneRod.setTemporaryAttribute("parsedRefGeneRod", parsedRefGeneRod);
+            }
+
+            if(parsedRefGeneRod.positiveStrand) {
+                parsedRefGeneRod.txSequence.append(ref.getBase());
+            } else {
+                parsedRefGeneRod.txSequence.insert(0, ref.getBase());
+            }
+
+            final long position = ref.getLocus().getStart();
+            if(parsedRefGeneRod.isProteinCodingTranscript() && position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd && parsedRefGeneRod.isWithinExon(position) ) {
+                if(parsedRefGeneRod.positiveStrand) {
+                    parsedRefGeneRod.cdsSequence.append(ref.getBase());
+                } else {
+                    parsedRefGeneRod.cdsSequence.insert(0, ref.getBase());
+                }
+            }
+
+            if(position == parsedRefGeneRod.txEnd) {
+                //we've reached the end of the transcript - compute all data and write it out.
+                generateOutputRecordsForROD(parsedRefGeneRod);
+
+                transcriptsProcessedCounter++;
+                if(transcriptsProcessedCounter % 100 == 0) {
+                    System.err.println(new Date() + ": " +  transcriptsProcessedCounter + " transcripts processed");
+                }
+            }
+        }
+
+        return 1;
+    }
+
+
+    private static final char[] ALLELES = {'A','C','G','T'};
+
+
+
+    private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod) {
+
+        //Transcripts that don't produce proteins are indicated in refGene by cdsStart == cdsEnd
+        //These will be handled by generating only one record, with haplotypeAlternate == "*".
+        final boolean isProteinCodingTranscript = parsedRefGeneRod.isProteinCodingTranscript();
+
+        final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //shorten the name
+
+        if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
+            System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
+            //TODO figure out why all CDS lengths aren't in multiples of 3 nucleotides - Sarah says its ok to thow these out.
+            return;
+        }
+
+
+        //here, 5' and 3' are according to the DNA (not on the RNA)
+        int frame = 0; //the frame of the current position
+        int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
+        int proteinOffset_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
+        int codonOffset_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
+
+        char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
+
+
+        final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
+        final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
+        final int increment_5to3 = positiveStrand ? 1 : -1;
+        final int strandSign = increment_5to3; //alias
+
+        final long cdsStart_5prime = positiveStrand ? parsedRefGeneRod.cdsStart : parsedRefGeneRod.cdsEnd; //1-based, inclusive
+        final long cdsEnd_3prime = positiveStrand ? parsedRefGeneRod.cdsEnd : parsedRefGeneRod.cdsStart ; //1-based, inclusive
+
+        //compute chromosome key (later used to sort the output file)
+        boolean mitochondrial = false;
+        long chromKey;
+        final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
+        switch( Character.toLowerCase(lastChromChar) ) {
+        case 'm':
+            chromKey = 0;
+            mitochondrial = true;
+            break;
+        case 'x':
+            chromKey = 23;
+            break;
+        case 'y':
+            chromKey = 24;
+            break;
+        default:
+            chromKey = Integer.parseInt(parsedRefGeneRod.txChrom.substring(3));
+            break;
+        }
+
+        chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
+        chromKey *= 100000000000L;
+
+
+        for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
+        {
+            //get the reference sequence
+            final char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );
+
+            //whether the current txCoord_5to3 is within an exon
+            final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron
+
+            //figure out what region the current position is in.
+            PositionType positionType;
+            if(isProteinCodingTranscript)
+            {
+                if(isWithinExon)
+                {
+                    if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) {
+                        positionType = PositionType.utr5;
+                    } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) {
+                        positionType = PositionType.utr3;
+                    } else {
+                        positionType = PositionType.CDS;
+                    }
+                } else {
+                    positionType = PositionType.intron;
+                }
+            } else {
+                if(isWithinExon) {
+                    positionType = PositionType.non_coding_exon;
+                } else {
+                    positionType = PositionType.non_coding_intron;
+                }
+            }
+
+            //compute current codon
+            if(positionType == PositionType.CDS && frame == 0)
+            {
+                if(currentCodon_5to3 == null) {
+                    currentCodon_5to3 = new char[3];
+                }
+
+                codonOffset_from5++;
+
+                currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 - 1 ); //subtract 1 to go to zero-based coords
+                currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 );
+                currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 + 1);
+
+                if(!positiveStrand) {
+                    currentCodon_5to3[0] = BaseUtils.simpleComplement(currentCodon_5to3[0]);
+                    currentCodon_5to3[1] = BaseUtils.simpleComplement(currentCodon_5to3[1]);
+                    currentCodon_5to3[2] = BaseUtils.simpleComplement(currentCodon_5to3[2]);
+                }
+            }
+
+
+            final String distanceToSpliceSiteString = Integer.toString(parsedRefGeneRod.distanceToSpliceSite(txCoord_5to3));
+
+            for(final char haplotypeAlternate : ALLELES )
+            {
+                final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
+
+                outputLineFields.put(OUTPUT_SORT_KEY,  Long.toString( chromKey + txCoord_5to3) );
+
+                outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + txCoord_5to3);
+                outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( haplotypeReference ) );
+                outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, isProteinCodingTranscript ? Character.toString( haplotypeAlternate ) : "*");
+                outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
+                outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
+                outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
+
+                outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
+                outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
+                outputLineFields.put(OUTPUT_SPLICE_DISTANCE, distanceToSpliceSiteString );
+
+                if(isProteinCodingTranscript)
+                {
+                    outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
+
+                    if(isWithinExon) {
+                        outputLineFields.put(OUTPUT_CODING_COORD, Integer.toString(proteinOffset_from5) );
+
+                        if(positionType == PositionType.CDS) {
+                            final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
+                            outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) );
+                            outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonOffset_from5) );
+                            outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ Integer.toString(codonOffset_from5) );
+                            outputLineFields.put(OUTPUT_CODING_COORD_STR, "c."+ Integer.toString(proteinOffset_from5) );
+
+                            outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon );
+                            final String refAA = AminoAcidTable.getAA3LetterCode(referenceCodon, mitochondrial);
+                            outputLineFields.put(OUTPUT_REFERENCE_AA, refAA);
+
+                            final char temp = currentCodon_5to3[frame];
+                            currentCodon_5to3[frame] = haplotypeAlternate;
+                            final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
+                            final String variantAA = AminoAcidTable.getAA3LetterCode(variantCodon, mitochondrial);
+                            outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon );
+                            outputLineFields.put(OUTPUT_VARIANT_AA, variantAA);
+
+                            currentCodon_5to3[frame] = temp;
+
+                            outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.equals(variantAA)));
+                        }
+                    }
+                }
+
+
+                //write out the record
+                StringBuilder outputLine = new StringBuilder();
+                for( String column : OUTPUT_COLUMN_NAMES) {
+                    if(outputLine.length() != 0) {
+                        outputLine.append(AnnotatorROD.DELIMITER);
+                    }
+                    String value = outputLineFields.get(column);
+                    outputLine.append(value != null ? value : "");
+                }
+
+                outputStream.println(outputLine);
+
+                if( !isProteinCodingTranscript ) {
+                    //need only one record for this position, instead of 4 (one for each allele)
+                    break;
+                }
+
+            } //ALLELE for-loop
+
+
+            txOffset_from5++;
+
+            if(positionType == PositionType.CDS) {
+                proteinOffset_from5++;
+                frame = (frame + 1) % 3;
+            }
+
+
+
+        } // l for-loop
+
+    } //method close
+
+
+    /**
+     * Increment the number of loci processed.
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return the new number of loci processed.
+     */
+    public Integer reduce(Integer value, Integer sum) {
+        return sum + value;
+    }
+
+    /**
+     * Tell the user the number of loci processed and close out the new variants file.
+     *
+     * @param result  the number of loci seen.
+     */
+    public void onTraversalDone(Integer result) {
+        out.printf("Processed %d loci.\n", result);
+
+        outputStream.close();
+    }
+
+
+
+}
+