From e643a9e7a5b12910f5d092f4a3c497c7d2ea0531 Mon Sep 17 00:00:00 2001 From: weisburd Date: Thu, 22 Apr 2010 12:11:19 +0000 Subject: [PATCH] Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing annotations for each possible variant at each transcript position (eg. 4 variants for each position). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3238 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/annotator/TranscriptToInfo.java | 642 ++++++++++++++++++ 1 file changed, 642 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java new file mode 100755 index 000000000..d3c8706f8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java @@ -0,0 +1,642 @@ +package org.broadinstitute.sting.playground.gatk.walkers.annotator; + +import java.io.BufferedOutputStream; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Collection; +import java.util.Date; +import java.util.LinkedHashMap; +import java.util.List; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.AnnotatorROD; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; + + +/** + * Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing + * annotations for each possible variant at each transcript position (eg. 4 variants for each position). + */ +@Reference(window=@Window(start=-4,stop=4)) +@By(DataSource.REFERENCE) +@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="refgene",type=AnnotatorROD.class) }) +public class TranscriptToInfo extends RodWalker +{ + +/** + RefGene column names: + # bin 182 smallint(5) unsigned range Indexing field to speed chromosome range queries. + # name NM_021649 varchar(255) values Name of gene (usually transcript_id from GTF) + # chrom chr5 varchar(255) values Reference sequence chromosome or scaffold + # strand - char(1) values + or - for strand + # txStart 114942238 int(10) unsigned range Transcription start position + # txEnd 114966008 int(10) unsigned range Transcription end position + # cdsStart 114944144 int(10) unsigned range Coding region start + # cdsEnd 114944852 int(10) unsigned range Coding region end + # exonCount 2 int(10) unsigned range Number of exons + # exonStarts 114942238,114965692, longblob Exon start positions + # exonEnds 114944911,114966008, longblob Exon end positions + # id 0 int(10) unsigned range Unique identifier + # name2 TICAM2 varchar(255) values Alternate name (e.g. gene_id from GTF) + # cdsStartStat cmpl enum('none','unk','incmpl','cmpl') values enum('none','unk','incmpl','cmpl') + # cdsEndStat cmpl enum('none','unk','incmpl','cmpl') values enum('none','unk','incmpl','cmpl') + # exonFrames 0,-1, longblob Exon frame {0,1,2}, or -1 if no frame for exon +*/ + + + + public static final int OUTPUT_STREAM_BUFFER_SIZE = 8*1024*1024; + + /** Output columns */ + public static final String OUTPUT_SORT_KEY = "00000_SORT_KEY"; + + public static final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN; + public static final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN; + public static final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN; + public static final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN; + + public static final String OUTPUT_GENE_NAME = "geneName"; //eg. NDUFS2 + public static final String OUTPUT_TRANSCRIPT_ACCESSION = "transcriptId"; //eg. NM_001212414.4 + + //public static final String OUTPUT_IN_TRANSCRIPT = "inTranscript"; //eg. true + public static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true + + public static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2 + public static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic + + public static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript + public static final String OUTPUT_CODING_COORD = "codingCoord"; //1-based offset within the cds region + + public static final String OUTPUT_SPLICE_DISTANCE = "minSpliceDistance"; //eg. 40 + + public static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20 + public static final String OUTPUT_REFERENCE_CODON = "referenceCodon"; + public static final String OUTPUT_REFERENCE_AA = "referenceAA"; + public static final String OUTPUT_VARIANT_CODON = "variantCodon"; + public static final String OUTPUT_VARIANT_AA = "variantAA"; + + public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true + + public static final String OUTPUT_CODING_COORD_STR = "coding_coord_str"; + public static final String OUTPUT_PROTEIN_COORD_STR = "protein_coord_str"; + public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str"; + + public static final String[] OUTPUT_COLUMN_NAMES = new String[] { + OUTPUT_SORT_KEY, + + OUTPUT_CHRPOS, + OUTPUT_HAPLOTYPE_REFERENCE, + OUTPUT_HAPLOTYPE_ALTERNATE, + OUTPUT_HAPLOTYPE_STRAND, + + OUTPUT_GENE_NAME, + OUTPUT_TRANSCRIPT_ACCESSION, + + OUTPUT_POSITION_TYPE, + + + OUTPUT_FRAME, + OUTPUT_MRNA_COORD, + OUTPUT_CODING_COORD, + OUTPUT_CODON_NUMBER, + OUTPUT_SPLICE_DISTANCE, + + OUTPUT_REFERENCE_CODON, + OUTPUT_REFERENCE_AA, + OUTPUT_VARIANT_CODON, + OUTPUT_VARIANT_AA, + OUTPUT_CHANGES_AMINO_ACID, + + OUTPUT_CODING_COORD_STR, + OUTPUT_PROTEIN_COORD_STR, + OUTPUT_INTRON_COORD_STR, + + OUTPUT_IN_CODING_REGION, + }; + + + /** + * Container for all data fields from a single refGene.txt row. + */ + static class RefGeneTranscriptRecord + { + public static final String REFGENE_NAME = "name"; //eg. NM_021649 + public static final String REFGENE_STRAND = "strand"; //eg. + + public static final String REFGENE_TXSTART = "txStart"; + public static final String REFGENE_TXEND = "txEnd"; + public static final String REFGENE_CDS_START = "cdsStart"; + public static final String REFGENE_CDS_END = "cdsEnd"; + public static final String REFGENE_EXON_COUNT = "exonCount"; + public static final String REFGENE_EXON_STARTS = "exonStarts"; + public static final String REFGENE_EXON_ENDS = "exonEnds"; + public static final String REFGENE_EXON_FRAMES = "exonFrames"; + public static final String REFGENE_NAME2 = "name2"; //eg. TICAM2 + + + /** + * This StringBuffer accumulates the entire transcript sequence. + * This buffer is used instead of using the GATK window mechanism + * because arbitrary-length look-aheads and look-behinds are needed to deal + * with codons that span splice-junctions. The window mechanism + * requires hard-coding the window size, which would translate into a + * limit on maximum supported intron size. To avoid this, the + * sequence is accumulated as the transcript is scanned left-to-right. + * Then, all calculations are performed at the end. + */ + public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3' + public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3' + + public boolean positiveStrand; + public String name; //eg. NM_021649 + public String name2; //eg. TICAM2 + + public String txChrom; //The chromosome + public long txStart; + public long txEnd; + + public long cdsStart; + public long cdsEnd; + + public long[] exonStarts; + public long[] exonEnds; + public int[] exonFrames; + + private AnnotatorROD rod; + + + public RefGeneTranscriptRecord(final AnnotatorROD refGeneRod) { + this.rod = refGeneRod; + //String binStr = rod.get("bin"); + //String idStr = refGeneRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) + + String strandStr = refGeneRod.get("strand"); + if(strandStr.equals("+")) { + positiveStrand = true; + } else if(strandStr.equals("-")) { + positiveStrand = false; + } else { + throw new IllegalArgumentException("refGene record contains unexpected strand value: \"" + strandStr + "\""); + } + + name = refGeneRod.get("name"); + name2 = refGeneRod.get("name2"); + + //String txStartStr = refGeneRod.get(REFGENE_TXSTART); //These fields are used to generate column 1 of the ROD file (eg. they get turned into chr:txStart-txStop) + //String txEndStr = refGeneRod.get(REFGENE_TXEND); + + final GenomeLoc loc = refGeneRod.getLocation(); + txChrom = loc.getContig(); + txStart = loc.getStart(); + txEnd = loc.getStop(); + + txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' + cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns + + String cdsStartStr = refGeneRod.get(REFGENE_CDS_START); + String cdsEndStr = refGeneRod.get(REFGENE_CDS_END); + + cdsStart = Long.parseLong(cdsStartStr); + cdsEnd = Long.parseLong(cdsEndStr); + + String exonCountStr = refGeneRod.get(REFGENE_EXON_COUNT); + String exonStartsStr = refGeneRod.get(REFGENE_EXON_STARTS); + String exonEndsStr = refGeneRod.get(REFGENE_EXON_ENDS); + String exonFramesStr = refGeneRod.get(REFGENE_EXON_FRAMES); + + String[] exonStartStrs = exonStartsStr.split(","); + String[] exonEndStrs = exonEndsStr.split(","); + String[] exonFrameStrs = exonFramesStr.split(","); + + int exonCount = Integer.parseInt(exonCountStr); + if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length || exonCount != exonFrameStrs.length) + { + throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + ", Exon frames: " + exonFramesStr + ", Exon count: " + exonCountStr +". refGeneRod = " + refGeneRod); + } + + exonStarts = new long[exonCount]; + exonEnds = new long[exonCount]; + exonFrames = new int[exonCount]; + for(int i = 0; i < exonCount; i++) { + exonStarts[i] = Long.parseLong(exonStartStrs[i]); + exonEnds[i] = Long.parseLong(exonEndStrs[i]); + exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); + } + } + + + /** + * Takes a genomic position on the same contig as the transcript, and + * returns true if this position falls within an exon. + */ + public boolean isWithinExon(final long genomPosition) { + for(int i = 0; i < exonStarts.length; i++) { + final long curStart = exonStarts[i]; + if(genomPosition < curStart) { + return false; + } + final long curStop = exonEnds[i]; + if(genomPosition <= curStop) { + return true; + } + } + + return false; + } + + /** + * Computes the distance to the nearest splice-site. + * Returns 1 when down-stream of splice-juction, -1 when upstream + */ + public int distanceToSpliceSite(final long genomPosition) { + int prevDistance = Integer.MAX_VALUE; + int curDistance; + for(int i = 0; i < exonStarts.length; i++) { + final long curStart = exonStarts[i]; + curDistance = (int) (curStart-genomPosition); + if(genomPosition < curStart) { + return Math.min(prevDistance, curDistance); + } else { + prevDistance = (int) (genomPosition - curStart) + 1; + } + + final long curStop = exonEnds[i]; + curDistance = (int) (curStop-genomPosition) + 1; + if(genomPosition <= curStop) { + return Math.min(prevDistance, curDistance); + } else { + prevDistance = (int) (genomPosition - curStop); + } + } + + return prevDistance; //out of exons. return genomPosition-curStop + } + + + /** + * Returns true if this is a coding transcript (eg. is translated + * into proteins). Returns false for non-coding RNA. + */ + public boolean isProteinCodingTranscript() { + return cdsStart < cdsEnd; + } + + @Override + public String toString() { + return rod.toString(); + } + + } + + + protected int transcriptsProcessedCounter = 0; + + + /** Possible values for the "POSITION_TYPE" output column. */ + enum PositionType { + intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron + } + + /** Output stream for computed results */ + private PrintStream outputStream; + + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + //init the output file + String outputFilename = getToolkit().getArguments().outFileName; + if(outputFilename == null ) { + outputStream = System.out; + } else { + try { + outputStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFilename), OUTPUT_STREAM_BUFFER_SIZE)); + } catch(Exception e) { + throw new StingException("Unable to open output file for writing: " + outputFilename); + } + } + + outputStream.println(Utils.join(AnnotatorROD.DELIMITER, OUTPUT_COLUMN_NAMES)); + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + + /** + * We want reads that span deletions + * + * @return true + */ + public boolean includeReadsWithDeletionAtLoci() { return true; } + + + + /** + * For each site of interest, generate the appropriate fields. + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + + Collection rods = tracker.getBoundRodTracks(); + if(rods.size() == 0) { + //if there's nothing overlapping this locus, skip it. + return 0; + } + + + + + final List refGeneRODs = (List) tracker.getReferenceMetaData("refgene"); + + for(Object refGeneRodObject : refGeneRODs) { + if(! (refGeneRodObject instanceof AnnotatorROD) ) { + throw new RuntimeException("The 'refgene' -B arg must have the type: 'AnnotatorInputTable'."); + } + + AnnotatorROD refGeneRod = (AnnotatorROD) refGeneRodObject; + RefGeneTranscriptRecord parsedRefGeneRod = (RefGeneTranscriptRecord) refGeneRod.getTemporaryAttribute("parsedRefGeneRod"); + if( parsedRefGeneRod == null ) { + parsedRefGeneRod = new RefGeneTranscriptRecord(refGeneRod); + refGeneRod.setTemporaryAttribute("parsedRefGeneRod", parsedRefGeneRod); + } + + if(parsedRefGeneRod.positiveStrand) { + parsedRefGeneRod.txSequence.append(ref.getBase()); + } else { + parsedRefGeneRod.txSequence.insert(0, ref.getBase()); + } + + final long position = ref.getLocus().getStart(); + if(parsedRefGeneRod.isProteinCodingTranscript() && position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd && parsedRefGeneRod.isWithinExon(position) ) { + if(parsedRefGeneRod.positiveStrand) { + parsedRefGeneRod.cdsSequence.append(ref.getBase()); + } else { + parsedRefGeneRod.cdsSequence.insert(0, ref.getBase()); + } + } + + if(position == parsedRefGeneRod.txEnd) { + //we've reached the end of the transcript - compute all data and write it out. + generateOutputRecordsForROD(parsedRefGeneRod); + + transcriptsProcessedCounter++; + if(transcriptsProcessedCounter % 100 == 0) { + System.err.println(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed"); + } + } + } + + return 1; + } + + + private static final char[] ALLELES = {'A','C','G','T'}; + + + + private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod) { + + //Transcripts that don't produce proteins are indicated in refGene by cdsStart == cdsEnd + //These will be handled by generating only one record, with haplotypeAlternate == "*". + final boolean isProteinCodingTranscript = parsedRefGeneRod.isProteinCodingTranscript(); + + final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //shorten the name + + if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) { + System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString()); + //TODO figure out why all CDS lengths aren't in multiples of 3 nucleotides - Sarah says its ok to thow these out. + return; + } + + + //here, 5' and 3' are according to the DNA (not on the RNA) + int frame = 0; //the frame of the current position + int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand + int proteinOffset_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand + int codonOffset_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.) + + char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3' + + + final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive + final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive + final int increment_5to3 = positiveStrand ? 1 : -1; + final int strandSign = increment_5to3; //alias + + final long cdsStart_5prime = positiveStrand ? parsedRefGeneRod.cdsStart : parsedRefGeneRod.cdsEnd; //1-based, inclusive + final long cdsEnd_3prime = positiveStrand ? parsedRefGeneRod.cdsEnd : parsedRefGeneRod.cdsStart ; //1-based, inclusive + + //compute chromosome key (later used to sort the output file) + boolean mitochondrial = false; + long chromKey; + final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1); + switch( Character.toLowerCase(lastChromChar) ) { + case 'm': + chromKey = 0; + mitochondrial = true; + break; + case 'x': + chromKey = 23; + break; + case 'y': + chromKey = 24; + break; + default: + chromKey = Integer.parseInt(parsedRefGeneRod.txChrom.substring(3)); + break; + } + + chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step) + chromKey *= 100000000000L; + + + for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) + { + //get the reference sequence + final char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 ); + + //whether the current txCoord_5to3 is within an exon + final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron + + //figure out what region the current position is in. + PositionType positionType; + if(isProteinCodingTranscript) + { + if(isWithinExon) + { + if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { + positionType = PositionType.utr5; + } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { + positionType = PositionType.utr3; + } else { + positionType = PositionType.CDS; + } + } else { + positionType = PositionType.intron; + } + } else { + if(isWithinExon) { + positionType = PositionType.non_coding_exon; + } else { + positionType = PositionType.non_coding_intron; + } + } + + //compute current codon + if(positionType == PositionType.CDS && frame == 0) + { + if(currentCodon_5to3 == null) { + currentCodon_5to3 = new char[3]; + } + + codonOffset_from5++; + + currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 - 1 ); //subtract 1 to go to zero-based coords + currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 ); + currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 + 1); + + if(!positiveStrand) { + currentCodon_5to3[0] = BaseUtils.simpleComplement(currentCodon_5to3[0]); + currentCodon_5to3[1] = BaseUtils.simpleComplement(currentCodon_5to3[1]); + currentCodon_5to3[2] = BaseUtils.simpleComplement(currentCodon_5to3[2]); + } + } + + + final String distanceToSpliceSiteString = Integer.toString(parsedRefGeneRod.distanceToSpliceSite(txCoord_5to3)); + + for(final char haplotypeAlternate : ALLELES ) + { + final LinkedHashMap outputLineFields = new LinkedHashMap(); + + outputLineFields.put(OUTPUT_SORT_KEY, Long.toString( chromKey + txCoord_5to3) ); + + outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + txCoord_5to3); + outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( haplotypeReference ) ); + outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, isProteinCodingTranscript ? Character.toString( haplotypeAlternate ) : "*"); + outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-"); + outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 ); + outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name ); + + outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); + outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) ); + outputLineFields.put(OUTPUT_SPLICE_DISTANCE, distanceToSpliceSiteString ); + + if(isProteinCodingTranscript) + { + outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); + + if(isWithinExon) { + outputLineFields.put(OUTPUT_CODING_COORD, Integer.toString(proteinOffset_from5) ); + + if(positionType == PositionType.CDS) { + final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; + outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) ); + outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonOffset_from5) ); + outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ Integer.toString(codonOffset_from5) ); + outputLineFields.put(OUTPUT_CODING_COORD_STR, "c."+ Integer.toString(proteinOffset_from5) ); + + outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon ); + final String refAA = AminoAcidTable.getAA3LetterCode(referenceCodon, mitochondrial); + outputLineFields.put(OUTPUT_REFERENCE_AA, refAA); + + final char temp = currentCodon_5to3[frame]; + currentCodon_5to3[frame] = haplotypeAlternate; + final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; + final String variantAA = AminoAcidTable.getAA3LetterCode(variantCodon, mitochondrial); + outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon ); + outputLineFields.put(OUTPUT_VARIANT_AA, variantAA); + + currentCodon_5to3[frame] = temp; + + outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.equals(variantAA))); + } + } + } + + + //write out the record + StringBuilder outputLine = new StringBuilder(); + for( String column : OUTPUT_COLUMN_NAMES) { + if(outputLine.length() != 0) { + outputLine.append(AnnotatorROD.DELIMITER); + } + String value = outputLineFields.get(column); + outputLine.append(value != null ? value : ""); + } + + outputStream.println(outputLine); + + if( !isProteinCodingTranscript ) { + //need only one record for this position, instead of 4 (one for each allele) + break; + } + + } //ALLELE for-loop + + + txOffset_from5++; + + if(positionType == PositionType.CDS) { + proteinOffset_from5++; + frame = (frame + 1) % 3; + } + + + + } // l for-loop + + } //method close + + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); + + outputStream.close(); + } + + + +} +