Finished implementing

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3273 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
weisburd 2010-04-30 14:14:31 +00:00
parent 5d5c7f9d34
commit ba78d146ec
1 changed files with 533 additions and 234 deletions

View File

@ -1,12 +1,16 @@
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -19,21 +23,25 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
/**
* Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
* Takes a UCSC refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
* annotations for each possible variant at each transcript position (eg. 4 variants for each position).
*
* The map & reduce types are both TreeMap<String, String>
* TreeMap entries represent lines in the output file. The TreeMap key is a combination of a given output line's position (so that this key can be used to sort all output lines
* by reference order), as well as allele and gene name (so that its unique across all output lines). The String value is the output line itself.
*/
@Reference(window=@Window(start=-4,stop=4))
@By(DataSource.REFERENCE)
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="refgene",type=AnnotatorROD.class) })
public class TranscriptToInfo extends RodWalker<Integer, Integer>
public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap<String, String>> implements TreeReducible<TreeMap<String, String>>
{
/**
@ -57,12 +65,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
*/
public static final int OUTPUT_STREAM_BUFFER_SIZE = 8*1024*1024;
/** Output columns */
public static final String OUTPUT_SORT_KEY = "00000_SORT_KEY";
public static final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN;
public static final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
public static final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
@ -89,18 +92,17 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
public static final String OUTPUT_VARIANT_AA = "variantAA";
public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
public static final String OUTPUT_FUNCTIONAL_CLASS = "functionalClass"; //eg. missense
public static final String OUTPUT_CODING_COORD_STR = "codingCoordStr";
public static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr";
//public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
public static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary
public static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5)
public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
OUTPUT_SORT_KEY,
OUTPUT_CHRPOS,
OUTPUT_HAPLOTYPE_REFERENCE,
@ -124,6 +126,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
OUTPUT_VARIANT_CODON,
OUTPUT_VARIANT_AA,
OUTPUT_CHANGES_AMINO_ACID,
OUTPUT_FUNCTIONAL_CLASS,
OUTPUT_CODING_COORD_STR,
OUTPUT_PROTEIN_COORD_STR,
@ -137,7 +140,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
/**
* Container for all data fields from a single refGene.txt row.
*/
static class RefGeneTranscriptRecord
protected static class RefGeneTranscriptRecord
{
public static final String REFGENE_NAME = "name"; //eg. NM_021649
public static final String REFGENE_STRAND = "strand"; //eg. +
@ -163,6 +166,7 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
* Then, all calculations are performed at the end.
*/
public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3'
public StringBuilder utr5Sequence; //the protein coding sequence (with introns removed) in order from 5' to 3'
public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3'
public boolean positiveStrand;
@ -187,14 +191,15 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
this.rod = refGeneRod;
//String binStr = rod.get("bin");
//String idStr = refGeneRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
String strandStr = refGeneRod.get("strand");
if(strandStr.equals("+")) {
if(strandStr == null) {
throw new IllegalArgumentException("refGene record doesn't contain a 'strand' column. Make sure refGene input file has a header and the usual columns: \"" + strandStr + "\"");
} else if(strandStr.equals("+")) {
positiveStrand = true;
} else if(strandStr.equals("-")) {
positiveStrand = false;
} else {
throw new IllegalArgumentException("refGene record contains unexpected strand value: \"" + strandStr + "\"");
throw new IllegalArgumentException("refGene record contains unexpected value for 'strand' column: \"" + strandStr + "\"");
}
name = refGeneRod.get("name");
@ -208,15 +213,18 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
txStart = loc.getStart();
txEnd = loc.getStop();
txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3'
cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
String cdsStartStr = refGeneRod.get(REFGENE_CDS_START);
String cdsEndStr = refGeneRod.get(REFGENE_CDS_END);
cdsStart = Long.parseLong(cdsStartStr);
cdsEnd = Long.parseLong(cdsEndStr);
txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3'
if(isProteinCodingTranscript()) {
utr5Sequence = new StringBuilder( positiveStrand ? (int) (cdsStart - txStart + 1) : (int) (txEnd - cdsEnd + 1) ); //TODO reduce init size by size of introns
cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
}
String exonCountStr = refGeneRod.get(REFGENE_EXON_COUNT);
String exonStartsStr = refGeneRod.get(REFGENE_EXON_STARTS);
String exonEndsStr = refGeneRod.get(REFGENE_EXON_ENDS);
@ -380,6 +388,8 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
}
protected String outputFilename = null;
protected int transcriptsProcessedCounter = 0;
@ -388,44 +398,37 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron
}
/** Output stream for computed results */
private PrintStream outputStream;
/**
* Prepare the output file and the list of available features.
*/
public void initialize() {
//init the output file
String outputFilename = getToolkit().getArguments().outFileName;
if(outputFilename == null ) {
outputStream = System.out;
} else {
try {
outputStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFilename), OUTPUT_STREAM_BUFFER_SIZE));
} catch(Exception e) {
throw new StingException("Unable to open output file for writing: " + outputFilename);
}
outputFilename = getToolkit().getArguments().outFileName;
if(outputFilename == null) {
throw new StingException("Output file not specified. (Used -o command-line-arg)" );
}
if(!new File(outputFilename).canWrite()) {
throw new StingException("Unable to write to output file: " + outputFilename );
}
outputStream.println(Utils.join(AnnotatorROD.DELIMITER, OUTPUT_COLUMN_NAMES));
}
/**
* Initialize the number of loci processed to zero.
* Initialize the output filename.
*
* @return 0
*/
public Integer reduceInit() { return 0; }
/**
* We want reads that span deletions
*
* @return true
*/
public boolean includeReadsWithDeletionAtLoci() { return true; }
public TreeMap<String, String> reduceInit()
{
return new TreeMap<String, String>() {
public String toString() {
return size() + " total entries";
}
};
}
/**
@ -436,18 +439,19 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
* @param context the context for the given locus
* @return 1 if the locus was successfully processed, 0 if otherwise
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
public TreeMap<String, String> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
return null;
final Collection<RODRecordList> rods = tracker.getBoundRodTracks();
if(rods.size() == 0) {
//if there's nothing overlapping this locus, skip it.
return 0;
return null;
}
final TreeMap<String, String> result = new TreeMap<String, String>();
final List<Object> refGeneRODs = (List<Object>) tracker.getReferenceMetaData("refgene");
@ -464,57 +468,105 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
refGeneRod.setTemporaryAttribute("parsedRefGeneRod", parsedRefGeneRod);
}
//populate the parsedRefGeneRod.positiveStrand data structure
if(parsedRefGeneRod.positiveStrand) {
parsedRefGeneRod.txSequence.append(ref.getBase());
} else {
parsedRefGeneRod.txSequence.insert(0, ref.getBase());
}
//continue populating the parsedRefGeneRod.utr5Sequence and parsedRefGeneRod.cdsSequence data structures
final long position = ref.getLocus().getStart();
if(parsedRefGeneRod.isProteinCodingTranscript() && position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd && parsedRefGeneRod.isWithinExon(position) ) {
if(parsedRefGeneRod.positiveStrand) {
parsedRefGeneRod.cdsSequence.append(ref.getBase());
} else {
parsedRefGeneRod.cdsSequence.insert(0, ref.getBase());
if(parsedRefGeneRod.isProteinCodingTranscript() && parsedRefGeneRod.isWithinExon(position) )
{
//we're within an exon of a proteinCodingTranscript
if(parsedRefGeneRod.positiveStrand)
{
if(position < parsedRefGeneRod.cdsStart)
{
parsedRefGeneRod.utr5Sequence.append(ref.getBase()); //within utr5
}
else if(position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd)
{
parsedRefGeneRod.cdsSequence.append(ref.getBase()); //within CDS
}
}
else
{
if(position > parsedRefGeneRod.cdsEnd)
{
//As we more left-to-right (aka. 3' to 5'), we do insert(0,..) to reverse the sequence so that it become 5' to 3' in parsedRefGeneRod.utr5Sequence.
parsedRefGeneRod.utr5Sequence.insert(0,ref.getBase()); //within utr5.
}
else if(position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd)
{
parsedRefGeneRod.cdsSequence.insert(0,ref.getBase()); //within CDS
}
}
}
if(position == parsedRefGeneRod.txEnd) {
if(position == parsedRefGeneRod.txEnd)
{
//we've reached the end of the transcript - compute all data and write it out.
generateOutputRecordsForROD(parsedRefGeneRod);
try
{
generateOutputRecordsForROD(parsedRefGeneRod, result);
}
catch(Exception e)
{
logger.error(Thread.currentThread().getName() + " - Unexpected error occurred at position: [" + parsedRefGeneRod.txChrom + ":" + position + "] in transcript: " + parsedRefGeneRod, e);
}
transcriptsProcessedCounter++;
if(transcriptsProcessedCounter % 100 == 0) {
System.err.println(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed");
logger.info(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed");
}
}
}
return 1;
return result;
}
private static final char[] ALLELES = {'A','C','G','T'};
private static final String OUTPUT_HEADER_LINE;
static {
StringBuilder outputHeaderLine = new StringBuilder();
for( final String column : OUTPUT_COLUMN_NAMES ) {
if(outputHeaderLine.length() != 0) {
outputHeaderLine.append( AnnotatorROD.DELIMITER );
}
outputHeaderLine.append(column);
}
OUTPUT_HEADER_LINE = outputHeaderLine.toString();
}
private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod) {
/**
* Returns the filename.
*
* @param parsedRefGeneRod
* @param result
* @return
*/
private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod, TreeMap<String, String> result) throws IOException
{
//Transcripts that don't produce proteins are indicated in refGene by cdsStart == cdsEnd
//These will be handled by generating only one record, with haplotypeAlternate == "*".
final boolean isProteinCodingTranscript = parsedRefGeneRod.isProteinCodingTranscript();
final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //shorten the name
final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //alias
if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
logger.warn("WARNING: Transcript " + parsedRefGeneRod.name +" at position ["+ parsedRefGeneRod.txChrom + ":" +parsedRefGeneRod.txStart + "-" + parsedRefGeneRod.txEnd + "] has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping...");
//discard transcripts where CDS length is not a multiple of 3
return;
}
char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement
@ -525,67 +577,52 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
int frame = 0; //the frame of the current position
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
int codonCount_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
int codingCoord_from5 = isProteinCodingTranscript ? parsedRefGeneRod.computeInitialCodingCoord() : 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
int utr5Count_from5 = 0;
char[] utr5NucBuffer_5to3 = null; //used to find uORFs - size = 5 because to hold the 3 codons that overlap any given position: [-2,-1,0], [-1,0,1], and [0,1,2]
int codonCount_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons
int codingCoord_from5 = isProteinCodingTranscript ? parsedRefGeneRod.computeInitialCodingCoord() : -1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
boolean codingCoordResetForCDS = false;
boolean codingCoordResetForUtr3 = false;
final char[] currentCodon_5to3 = isProteinCodingTranscript ? new char[3] : null; //holds the current RNA codon - 5' to 3'
boolean mitochondrial = false;
//compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
long chromKey;
final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
switch( Character.toLowerCase(lastChromChar) ) {
case 'm':
chromKey = 0;
mitochondrial = true;
break;
case 'x':
chromKey = 23;
break;
case 'y':
chromKey = 24;
break;
default:
chromKey = Integer.parseInt(parsedRefGeneRod.txChrom.substring(3));
break;
}
chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
chromKey *= 100000000000L;
final boolean mitochondrial = parsedRefGeneRod.txChrom.toLowerCase().startsWith("chrm");
PositionType positionType = null;
boolean isWithinIntronAndFarFromSpliceJunction = false;
long intronStart_5prime = -1;
long intronEnd_5prime = -1;
final Map<String, String> outputLineFields = new HashMap<String, String>();
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
{
//get the reference sequence
final char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );
//whether the current txCoord_5to3 is within an exon
//compute certain attributes of the current position
final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron
final int distanceToNearestSpliceSite = parsedRefGeneRod.computeDistanceToNearestSpliceSite(txCoord_5to3);
final boolean isWithin10bpOfSpliceJunction = Math.abs(distanceToNearestSpliceSite) <= 10;
//figure out what region the current position is in.
//increment coding coord is necessary
if(isWithinExon) {
codingCoord_from5++;
}
//figure out the current positionType
final PositionType prevPositionType = positionType; //save the position before it is updated
if(isProteinCodingTranscript)
{
if(isWithinExon)
{
//update positionType (and codingCoord_from5 if needed)
if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //multiplying by strandSign is like doing absolute value.
if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //utr5 (multiplying by strandSign is like doing absolute value.)
positionType = PositionType.utr5;
} else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //multiplying by strandSign is like doing absolute value.
if(positionType != PositionType.utr3) {
//if we're transitioning from CDS to utr3, reset the coding coord to 1.
codingCoord_from5 = 1; //reset the coding coord for utr3
} else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //utr3 (multiplying by strandSign is like doing absolute value.)
positionType = PositionType.utr3;
}
} else {
if(positionType != PositionType.CDS) {
//if we're transitioning from utr5 to CDS, reset the coding coord from -1 to 1.
codingCoord_from5 = 1;
positionType = PositionType.CDS;
}
}
} else {
positionType = PositionType.intron;
}
@ -597,15 +634,119 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
}
}
//compute current codon
if(positionType == PositionType.CDS && frame == 0)
{
if(currentCodon_5to3 == null) {
currentCodon_5to3 = new char[3];
//handle transitions
if(positionType == PositionType.CDS && prevPositionType != PositionType.CDS && !codingCoordResetForCDS) {
//transitioning from utr5 to CDS, reset the coding coord from -1 to 1.
codingCoord_from5 = 1;
codingCoordResetForCDS = true;
} else if(positionType == PositionType.utr3 && prevPositionType != PositionType.utr3 && !codingCoordResetForUtr3) {
//transitioning from CDS to utr3, reset the coding coord to 1.
codingCoord_from5 = 1;
codingCoordResetForUtr3 = true;
}
codonCount_from5++;
try
{
//handle introns
boolean wasWithinIntronAndFarFromSpliceJunction = isWithinIntronAndFarFromSpliceJunction;
isWithinIntronAndFarFromSpliceJunction = !isWithinExon && !isWithin10bpOfSpliceJunction;
if(!wasWithinIntronAndFarFromSpliceJunction && isWithinIntronAndFarFromSpliceJunction) {
//save intron start
intronStart_5prime = txCoord_5to3;
} else if(wasWithinIntronAndFarFromSpliceJunction && !isWithinIntronAndFarFromSpliceJunction) {
//output intron record
intronEnd_5prime = txCoord_5to3 - increment_5to3;
final long intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ;
final long intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime);
outputLineFields.clear();
outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + intronStart + "-" + intronEnd);
outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) );
outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, Character.toString( '*' ) );
outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
if(isProteinCodingTranscript)
{
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
}
final String sortKey = computeSortKey(parsedRefGeneRod.txChrom, intronStart, '*', parsedRefGeneRod);
addThisLineToResult(sortKey, outputLineFields, result);
}
//when in utr5, compute the utr5NucBuffer_5to3 which is later used to compute the OUTPUT_UORF_CHANGE field
if(positionType == PositionType.utr5)
{
if(utr5Count_from5 < parsedRefGeneRod.utr5Sequence.length())
{
if(utr5NucBuffer_5to3 == null) {
//initialize
utr5NucBuffer_5to3 = new char[5];
utr5NucBuffer_5to3[3] = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 );
if(!positiveStrand) {
utr5NucBuffer_5to3[3] = BaseUtils.simpleComplement(utr5NucBuffer_5to3[3]);
}
if(utr5Count_from5 + 1 < parsedRefGeneRod.utr5Sequence.length() ) {
utr5NucBuffer_5to3[4] = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 + 1 );
if(!positiveStrand) {
utr5NucBuffer_5to3[4] = BaseUtils.simpleComplement(utr5NucBuffer_5to3[4]);
}
}
}
//as we move 5' to 3', shift nucleotides down to the 5' end, making room for the new 3' nucleotide:
utr5NucBuffer_5to3[0] = utr5NucBuffer_5to3[1];
utr5NucBuffer_5to3[1] = utr5NucBuffer_5to3[2];
utr5NucBuffer_5to3[2] = utr5NucBuffer_5to3[3];
utr5NucBuffer_5to3[3] = utr5NucBuffer_5to3[4];
char nextRefBase = 0;
if( utr5Count_from5 + 2 < parsedRefGeneRod.utr5Sequence.length() )
{
nextRefBase = parsedRefGeneRod.utr5Sequence.charAt( utr5Count_from5 + 2 );
//reverse complement if necessary
if(!positiveStrand) {
nextRefBase = BaseUtils.simpleComplement(nextRefBase);
}
}
utr5NucBuffer_5to3[4] = nextRefBase;
//check for bad bases
if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) ||
(utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) ||
(utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) ||
(utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) ||
(utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4])))
{
logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedRefGeneRod);
continue;
}
} else { // if(utr5Count_from5 >= parsedRefGeneRod.utr5Sequence.length())
//defensive programming
logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". utr5Count_from5 is now " + utr5Count_from5 + ", while parsedRefGeneRod.utr5Sequence.length() == " + parsedRefGeneRod.utr5Sequence.length() + ". This means parsedRefGeneRod.utr5Sequence isn't as long as it should be. This is a bug in handling this record: " + parsedRefGeneRod);
continue;
}
}
//when in CDS, compute current codon
if(positionType == PositionType.CDS)
{
if(frame == 0)
{
currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords
currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 );
currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( codingCoord_from5 + 1);
@ -617,15 +758,34 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
}
}
for(final char haplotypeAlternate : ALLELES )
//check for bad bases
if(!BaseUtils.isRegularBase(currentCodon_5to3[0]) || !BaseUtils.isRegularBase(currentCodon_5to3[1]) || !BaseUtils.isRegularBase(currentCodon_5to3[2])) {
logger.error("Exception: Skipping current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name +". CDS codon contains irregular base:" + currentCodon_5to3[0] + currentCodon_5to3[1] + currentCodon_5to3[2]);// +". Transcript is: " + parsedRefGeneRod);
continue;
}
}
char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );
if(!BaseUtils.isRegularBase(haplotypeReference)) {
//check for bad bases
logger.error("Exception: Current position [" + parsedRefGeneRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedRefGeneRod.name + " contains an irregular base:" + haplotypeReference); // +". Transcript is: " + parsedRefGeneRod);
continue;
}
for(char haplotypeAlternate : ALLELES )
{
final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
outputLineFields.clear();
outputLineFields.put(OUTPUT_SORT_KEY, Long.toString( chromKey + txCoord_5to3) );
if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) {
haplotypeReference = '*';
haplotypeAlternate = '*';
}
//compute simple OUTPUT fields.
outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + txCoord_5to3);
outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( haplotypeReference ) );
outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, isProteinCodingTranscript ? Character.toString( haplotypeAlternate ) : "*");
outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, Character.toString( haplotypeAlternate ) );
outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
@ -634,9 +794,9 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) );
//comppute OUTPUT_SPLICE_INFO
//compute OUTPUT_SPLICE_INFO
final String spliceInfoString;
if(Math.abs(distanceToNearestSpliceSite) <= 10) {
if(isWithin10bpOfSpliceJunction) {
if(distanceToNearestSpliceSite < 0) {
//is on the 5' side of the splice junction
if(isWithinExon) {
@ -654,19 +814,42 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString);
}
//compute OUTPUT_IN_CODING_REGION
if(isProteinCodingTranscript)
{
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
}
if(isWithinExon)
{
//compute OUTPUT_UORF_CHANGE
if(positionType == PositionType.utr5)
{
/*
if(THIS IS A uORF)
outputLineFields.put(OUTPUT_UORF_CHANGE, "TODO" ); //TODO put this here
*/
String refCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + utr5NucBuffer_5to3[2]).toLowerCase();
String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toLowerCase();
String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toLowerCase();
String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternate).toLowerCase();
String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternate) + utr5NucBuffer_5to3[3]).toLowerCase();
String varCodon3 = (Character.toString(haplotypeAlternate) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toLowerCase();
//check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF)
String uORFChangeStr = null;
if( (refCodon1.equals("atg") && !varCodon1.equals("atg")) ||
(refCodon2.equals("atg") && !varCodon2.equals("atg")) ||
(refCodon3.equals("atg") && !varCodon3.equals("atg")))
{
uORFChangeStr = "-1";
}
else if((varCodon1.equals("atg") && !refCodon1.equals("atg")) ||
(varCodon2.equals("atg") && !refCodon2.equals("atg")) ||
(varCodon3.equals("atg") && !refCodon3.equals("atg")))
{
uORFChangeStr = "+1";
}
outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr );
}
//compute CDS-specific fields
else if(positionType == PositionType.CDS)
{
final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
@ -688,11 +871,26 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
currentCodon_5to3[frame] = temp;
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.getLetter().equals(variantAA.getLetter())));
boolean changesAA = !refAA.equals(variantAA);
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA));
final String functionalClass;
if(changesAA) {
if(variantAA.isStop()) {
functionalClass ="nonsense";
} else {
functionalClass ="missense";
}
} else {
functionalClass ="silent";
}
outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass);
}
//compute OUTPUT_CODING_COORD_STR
if(isProteinCodingTranscript)
{
//compute coding coord
final StringBuilder codingCoordStr = new StringBuilder();
codingCoordStr.append( mitochondrial ? "m." : "c." );
if(positionType == PositionType.utr3) {
@ -701,79 +899,180 @@ public class TranscriptToInfo extends RodWalker<Integer, Integer>
if(isWithinExon) {
codingCoordStr.append( Integer.toString(codingCoord_from5) );
codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
} else {
//intronic coordinates
if(distanceToNearestSpliceSite < 0) {
codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) );
} else {
codingCoordStr.append( Integer.toString(codingCoord_from5 ) );
codingCoordStr.append( "+" );
}
codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) );
}
codingCoordStr.append( haplotypeReference + ">" + haplotypeAlternate);
outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString() );
outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString());
}
//write out the record
final StringBuilder outputLine = new StringBuilder();
for( final String column : OUTPUT_COLUMN_NAMES ) {
if(outputLine.length() != 0) {
outputLine.append( AnnotatorROD.DELIMITER );
}
final String value = outputLineFields.get(column);
outputLine.append(value != null ? value : "");
//generate the output line and add it to 'result' map.
if(!isWithinIntronAndFarFromSpliceJunction) {
final String sortKey = computeSortKey(parsedRefGeneRod.txChrom, txCoord_5to3, haplotypeAlternate, parsedRefGeneRod);
addThisLineToResult(sortKey, outputLineFields, result);
}
outputStream.println(outputLine);
if( !isProteinCodingTranscript ) {
if( haplotypeAlternate == '*' ) {
//need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles
break;
}
} //ALLELE for-loop
}
finally
{
//increment coords
txOffset_from5++;
if(positionType == PositionType.CDS) {
if(positionType == PositionType.utr5) {
utr5Count_from5++;
} else if(positionType == PositionType.CDS) {
frame = (frame + 1) % 3;
if(frame == 0) {
codonCount_from5++;
}
}
if(isWithinExon) {
codingCoord_from5++;
}
} // l for-loop
} //method close
/**
* Increment the number of loci processed.
* Utility method. Creates a line containing the outputLineFields, and adds it to result, hashed by the sortKey.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
* @param sortKey Reference-order sort key
* @param outputLineFields Column-name to value pairs.
* @param result The hashmap that accumulates all results.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
private void addThisLineToResult(final String sortKey, final Map<String, String> outputLineFields, TreeMap<String, String> result) {
final StringBuilder outputLine = new StringBuilder();
for( final String column : OUTPUT_COLUMN_NAMES ) {
if(outputLine.length() != 0) {
outputLine.append( AnnotatorROD.DELIMITER );
}
final String value = outputLineFields.get(column);
if(value != null) {
outputLine.append(value);
}
}
result.put(sortKey, outputLine.toString());
}
/**
* Merge the "value" file with the "sum" file and return the file name
* of the merged data set.
*/
@Override
public TreeMap<String, String> reduce(TreeMap<String, String> value, TreeMap<String, String> sum)
{
if(value != null) {
sum.putAll(value);
}
return sum;
}
@Override
public TreeMap<String, String> treeReduce(TreeMap<String, String> lhs, TreeMap<String, String> rhs)
{
lhs.putAll(rhs);
return lhs;
}
/**
* Computes a reference-order sort key for a record with the given
* UCSC chromosome name, position, and allele at that position.
*
* @param txChrom
* @param position
* @param allele
* @param parsedRefGeneRod
* @return
*/
protected String computeSortKey(String txChrom, long position, char allele, RefGeneTranscriptRecord parsedRefGeneRod)
{
//compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
long key;
final char lastChromChar = txChrom.charAt(txChrom.length() -1);
switch( Character.toLowerCase(lastChromChar) ) {
case 'm':
key = 0;
break;
case 'x':
key = 23;
break;
case 'y':
key = 24;
break;
default:
key = Integer.parseInt(txChrom.substring(3));
break;
}
key++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
key += 100; //add a leading 1 to avoid having to create leading zeros
key *= 10*1000L*1000L*1000L*1000L; //this way position doesn't overlap with chromosome keys
key += position*1000L;
key += allele;
String result = Long.toString(key);
if(parsedRefGeneRod != null) {
result += (parsedRefGeneRod.name + parsedRefGeneRod.name2);
}
return result;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
* Moves the file to the destination directory.
*
* @param result the number of loci seen.
* @param result The file of the result
*/
public void onTraversalDone(Integer result) {
out.printf("Processed %d loci.\n", result);
outputStream.close();
public void onTraversalDone(TreeMap<String, String> result)
{
if(result == null) {
logger.info("Result is: null. No output file was generated.\n");
return;
}
//move the fully merged temp file to the output file.
logger.info("Result - writing " + result.size() + " lines to: " + outputFilename);
BufferedWriter fileWriter = null;
try {
fileWriter = new BufferedWriter(new FileWriter(outputFilename));
fileWriter.write(OUTPUT_HEADER_LINE + '\n');
for(String value : result.values() ) {
fileWriter.write(value + "\n");
}
} catch(IOException e) {
logger.info("Unable to write to file: " + outputFilename);
} finally {
try {
if (fileWriter != null)
fileWriter.close();
} catch(IOException e) {
logger.info("Unable to write to file: " + outputFilename);
}
}
logger.info("Deleting temp files..");
}
}