Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing annotations for each possible variant at each transcript position (eg. 4 variants for each position).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3238 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
653e08c0b6
commit
e643a9e7a5
|
|
@ -0,0 +1,642 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
|
||||||
|
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.By;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a refGene table ( -B arg must be: -B refgene,AnnotatorInfoTable,/path/to/refgene_file.txt) and generates the big table of nucleotides containing
|
||||||
|
* annotations for each possible variant at each transcript position (eg. 4 variants for each position).
|
||||||
|
*/
|
||||||
|
@Reference(window=@Window(start=-4,stop=4))
|
||||||
|
@By(DataSource.REFERENCE)
|
||||||
|
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="refgene",type=AnnotatorROD.class) })
|
||||||
|
public class TranscriptToInfo extends RodWalker<Integer, Integer>
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
RefGene column names:
|
||||||
|
# bin 182 smallint(5) unsigned range Indexing field to speed chromosome range queries.
|
||||||
|
# name NM_021649 varchar(255) values Name of gene (usually transcript_id from GTF)
|
||||||
|
# chrom chr5 varchar(255) values Reference sequence chromosome or scaffold
|
||||||
|
# strand - char(1) values + or - for strand
|
||||||
|
# txStart 114942238 int(10) unsigned range Transcription start position
|
||||||
|
# txEnd 114966008 int(10) unsigned range Transcription end position
|
||||||
|
# cdsStart 114944144 int(10) unsigned range Coding region start
|
||||||
|
# cdsEnd 114944852 int(10) unsigned range Coding region end
|
||||||
|
# exonCount 2 int(10) unsigned range Number of exons
|
||||||
|
# exonStarts 114942238,114965692, longblob Exon start positions
|
||||||
|
# exonEnds 114944911,114966008, longblob Exon end positions
|
||||||
|
# id 0 int(10) unsigned range Unique identifier
|
||||||
|
# name2 TICAM2 varchar(255) values Alternate name (e.g. gene_id from GTF)
|
||||||
|
# cdsStartStat cmpl enum('none','unk','incmpl','cmpl') values enum('none','unk','incmpl','cmpl')
|
||||||
|
# cdsEndStat cmpl enum('none','unk','incmpl','cmpl') values enum('none','unk','incmpl','cmpl')
|
||||||
|
# exonFrames 0,-1, longblob Exon frame {0,1,2}, or -1 if no frame for exon
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public static final int OUTPUT_STREAM_BUFFER_SIZE = 8*1024*1024;
|
||||||
|
|
||||||
|
/** Output columns */
|
||||||
|
public static final String OUTPUT_SORT_KEY = "00000_SORT_KEY";
|
||||||
|
|
||||||
|
public static final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN;
|
||||||
|
public static final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
|
||||||
|
public static final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
|
||||||
|
public static final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN;
|
||||||
|
|
||||||
|
public static final String OUTPUT_GENE_NAME = "geneName"; //eg. NDUFS2
|
||||||
|
public static final String OUTPUT_TRANSCRIPT_ACCESSION = "transcriptId"; //eg. NM_001212414.4
|
||||||
|
|
||||||
|
//public static final String OUTPUT_IN_TRANSCRIPT = "inTranscript"; //eg. true
|
||||||
|
public static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true
|
||||||
|
|
||||||
|
public static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2
|
||||||
|
public static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic
|
||||||
|
|
||||||
|
public static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript
|
||||||
|
public static final String OUTPUT_CODING_COORD = "codingCoord"; //1-based offset within the cds region
|
||||||
|
|
||||||
|
public static final String OUTPUT_SPLICE_DISTANCE = "minSpliceDistance"; //eg. 40
|
||||||
|
|
||||||
|
public static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20
|
||||||
|
public static final String OUTPUT_REFERENCE_CODON = "referenceCodon";
|
||||||
|
public static final String OUTPUT_REFERENCE_AA = "referenceAA";
|
||||||
|
public static final String OUTPUT_VARIANT_CODON = "variantCodon";
|
||||||
|
public static final String OUTPUT_VARIANT_AA = "variantAA";
|
||||||
|
|
||||||
|
public static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true
|
||||||
|
|
||||||
|
public static final String OUTPUT_CODING_COORD_STR = "coding_coord_str";
|
||||||
|
public static final String OUTPUT_PROTEIN_COORD_STR = "protein_coord_str";
|
||||||
|
public static final String OUTPUT_INTRON_COORD_STR = "intron_coord_str";
|
||||||
|
|
||||||
|
public static final String[] OUTPUT_COLUMN_NAMES = new String[] {
|
||||||
|
OUTPUT_SORT_KEY,
|
||||||
|
|
||||||
|
OUTPUT_CHRPOS,
|
||||||
|
OUTPUT_HAPLOTYPE_REFERENCE,
|
||||||
|
OUTPUT_HAPLOTYPE_ALTERNATE,
|
||||||
|
OUTPUT_HAPLOTYPE_STRAND,
|
||||||
|
|
||||||
|
OUTPUT_GENE_NAME,
|
||||||
|
OUTPUT_TRANSCRIPT_ACCESSION,
|
||||||
|
|
||||||
|
OUTPUT_POSITION_TYPE,
|
||||||
|
|
||||||
|
|
||||||
|
OUTPUT_FRAME,
|
||||||
|
OUTPUT_MRNA_COORD,
|
||||||
|
OUTPUT_CODING_COORD,
|
||||||
|
OUTPUT_CODON_NUMBER,
|
||||||
|
OUTPUT_SPLICE_DISTANCE,
|
||||||
|
|
||||||
|
OUTPUT_REFERENCE_CODON,
|
||||||
|
OUTPUT_REFERENCE_AA,
|
||||||
|
OUTPUT_VARIANT_CODON,
|
||||||
|
OUTPUT_VARIANT_AA,
|
||||||
|
OUTPUT_CHANGES_AMINO_ACID,
|
||||||
|
|
||||||
|
OUTPUT_CODING_COORD_STR,
|
||||||
|
OUTPUT_PROTEIN_COORD_STR,
|
||||||
|
OUTPUT_INTRON_COORD_STR,
|
||||||
|
|
||||||
|
OUTPUT_IN_CODING_REGION,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Container for all data fields from a single refGene.txt row.
|
||||||
|
*/
|
||||||
|
static class RefGeneTranscriptRecord
|
||||||
|
{
|
||||||
|
public static final String REFGENE_NAME = "name"; //eg. NM_021649
|
||||||
|
public static final String REFGENE_STRAND = "strand"; //eg. +
|
||||||
|
public static final String REFGENE_TXSTART = "txStart";
|
||||||
|
public static final String REFGENE_TXEND = "txEnd";
|
||||||
|
public static final String REFGENE_CDS_START = "cdsStart";
|
||||||
|
public static final String REFGENE_CDS_END = "cdsEnd";
|
||||||
|
public static final String REFGENE_EXON_COUNT = "exonCount";
|
||||||
|
public static final String REFGENE_EXON_STARTS = "exonStarts";
|
||||||
|
public static final String REFGENE_EXON_ENDS = "exonEnds";
|
||||||
|
public static final String REFGENE_EXON_FRAMES = "exonFrames";
|
||||||
|
public static final String REFGENE_NAME2 = "name2"; //eg. TICAM2
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This StringBuffer accumulates the entire transcript sequence.
|
||||||
|
* This buffer is used instead of using the GATK window mechanism
|
||||||
|
* because arbitrary-length look-aheads and look-behinds are needed to deal
|
||||||
|
* with codons that span splice-junctions. The window mechanism
|
||||||
|
* requires hard-coding the window size, which would translate into a
|
||||||
|
* limit on maximum supported intron size. To avoid this, the
|
||||||
|
* sequence is accumulated as the transcript is scanned left-to-right.
|
||||||
|
* Then, all calculations are performed at the end.
|
||||||
|
*/
|
||||||
|
public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3'
|
||||||
|
public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3'
|
||||||
|
|
||||||
|
public boolean positiveStrand;
|
||||||
|
public String name; //eg. NM_021649
|
||||||
|
public String name2; //eg. TICAM2
|
||||||
|
|
||||||
|
public String txChrom; //The chromosome
|
||||||
|
public long txStart;
|
||||||
|
public long txEnd;
|
||||||
|
|
||||||
|
public long cdsStart;
|
||||||
|
public long cdsEnd;
|
||||||
|
|
||||||
|
public long[] exonStarts;
|
||||||
|
public long[] exonEnds;
|
||||||
|
public int[] exonFrames;
|
||||||
|
|
||||||
|
private AnnotatorROD rod;
|
||||||
|
|
||||||
|
|
||||||
|
public RefGeneTranscriptRecord(final AnnotatorROD refGeneRod) {
|
||||||
|
this.rod = refGeneRod;
|
||||||
|
//String binStr = rod.get("bin");
|
||||||
|
//String idStr = refGeneRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
|
||||||
|
|
||||||
|
String strandStr = refGeneRod.get("strand");
|
||||||
|
if(strandStr.equals("+")) {
|
||||||
|
positiveStrand = true;
|
||||||
|
} else if(strandStr.equals("-")) {
|
||||||
|
positiveStrand = false;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("refGene record contains unexpected strand value: \"" + strandStr + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
name = refGeneRod.get("name");
|
||||||
|
name2 = refGeneRod.get("name2");
|
||||||
|
|
||||||
|
//String txStartStr = refGeneRod.get(REFGENE_TXSTART); //These fields are used to generate column 1 of the ROD file (eg. they get turned into chr:txStart-txStop)
|
||||||
|
//String txEndStr = refGeneRod.get(REFGENE_TXEND);
|
||||||
|
|
||||||
|
final GenomeLoc loc = refGeneRod.getLocation();
|
||||||
|
txChrom = loc.getContig();
|
||||||
|
txStart = loc.getStart();
|
||||||
|
txEnd = loc.getStop();
|
||||||
|
|
||||||
|
txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3'
|
||||||
|
cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
|
||||||
|
|
||||||
|
String cdsStartStr = refGeneRod.get(REFGENE_CDS_START);
|
||||||
|
String cdsEndStr = refGeneRod.get(REFGENE_CDS_END);
|
||||||
|
|
||||||
|
cdsStart = Long.parseLong(cdsStartStr);
|
||||||
|
cdsEnd = Long.parseLong(cdsEndStr);
|
||||||
|
|
||||||
|
String exonCountStr = refGeneRod.get(REFGENE_EXON_COUNT);
|
||||||
|
String exonStartsStr = refGeneRod.get(REFGENE_EXON_STARTS);
|
||||||
|
String exonEndsStr = refGeneRod.get(REFGENE_EXON_ENDS);
|
||||||
|
String exonFramesStr = refGeneRod.get(REFGENE_EXON_FRAMES);
|
||||||
|
|
||||||
|
String[] exonStartStrs = exonStartsStr.split(",");
|
||||||
|
String[] exonEndStrs = exonEndsStr.split(",");
|
||||||
|
String[] exonFrameStrs = exonFramesStr.split(",");
|
||||||
|
|
||||||
|
int exonCount = Integer.parseInt(exonCountStr);
|
||||||
|
if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length || exonCount != exonFrameStrs.length)
|
||||||
|
{
|
||||||
|
throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + ", Exon frames: " + exonFramesStr + ", Exon count: " + exonCountStr +". refGeneRod = " + refGeneRod);
|
||||||
|
}
|
||||||
|
|
||||||
|
exonStarts = new long[exonCount];
|
||||||
|
exonEnds = new long[exonCount];
|
||||||
|
exonFrames = new int[exonCount];
|
||||||
|
for(int i = 0; i < exonCount; i++) {
|
||||||
|
exonStarts[i] = Long.parseLong(exonStartStrs[i]);
|
||||||
|
exonEnds[i] = Long.parseLong(exonEndStrs[i]);
|
||||||
|
exonFrames[i] = Integer.parseInt(exonFrameStrs[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a genomic position on the same contig as the transcript, and
|
||||||
|
* returns true if this position falls within an exon.
|
||||||
|
*/
|
||||||
|
public boolean isWithinExon(final long genomPosition) {
|
||||||
|
for(int i = 0; i < exonStarts.length; i++) {
|
||||||
|
final long curStart = exonStarts[i];
|
||||||
|
if(genomPosition < curStart) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final long curStop = exonEnds[i];
|
||||||
|
if(genomPosition <= curStop) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the distance to the nearest splice-site.
|
||||||
|
* Returns 1 when down-stream of splice-juction, -1 when upstream
|
||||||
|
*/
|
||||||
|
public int distanceToSpliceSite(final long genomPosition) {
|
||||||
|
int prevDistance = Integer.MAX_VALUE;
|
||||||
|
int curDistance;
|
||||||
|
for(int i = 0; i < exonStarts.length; i++) {
|
||||||
|
final long curStart = exonStarts[i];
|
||||||
|
curDistance = (int) (curStart-genomPosition);
|
||||||
|
if(genomPosition < curStart) {
|
||||||
|
return Math.min(prevDistance, curDistance);
|
||||||
|
} else {
|
||||||
|
prevDistance = (int) (genomPosition - curStart) + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
final long curStop = exonEnds[i];
|
||||||
|
curDistance = (int) (curStop-genomPosition) + 1;
|
||||||
|
if(genomPosition <= curStop) {
|
||||||
|
return Math.min(prevDistance, curDistance);
|
||||||
|
} else {
|
||||||
|
prevDistance = (int) (genomPosition - curStop);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return prevDistance; //out of exons. return genomPosition-curStop
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if this is a coding transcript (eg. is translated
|
||||||
|
* into proteins). Returns false for non-coding RNA.
|
||||||
|
*/
|
||||||
|
public boolean isProteinCodingTranscript() {
|
||||||
|
return cdsStart < cdsEnd;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return rod.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected int transcriptsProcessedCounter = 0;
|
||||||
|
|
||||||
|
|
||||||
|
/** Possible values for the "POSITION_TYPE" output column. */
|
||||||
|
enum PositionType {
|
||||||
|
intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Output stream for computed results */
|
||||||
|
private PrintStream outputStream;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare the output file and the list of available features.
|
||||||
|
*/
|
||||||
|
public void initialize() {
|
||||||
|
//init the output file
|
||||||
|
String outputFilename = getToolkit().getArguments().outFileName;
|
||||||
|
if(outputFilename == null ) {
|
||||||
|
outputStream = System.out;
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
outputStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFilename), OUTPUT_STREAM_BUFFER_SIZE));
|
||||||
|
} catch(Exception e) {
|
||||||
|
throw new StingException("Unable to open output file for writing: " + outputFilename);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
outputStream.println(Utils.join(AnnotatorROD.DELIMITER, OUTPUT_COLUMN_NAMES));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the number of loci processed to zero.
|
||||||
|
*
|
||||||
|
* @return 0
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We want reads that span deletions
|
||||||
|
*
|
||||||
|
* @return true
|
||||||
|
*/
|
||||||
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each site of interest, generate the appropriate fields.
|
||||||
|
*
|
||||||
|
* @param tracker the meta-data tracker
|
||||||
|
* @param ref the reference base
|
||||||
|
* @param context the context for the given locus
|
||||||
|
* @return 1 if the locus was successfully processed, 0 if otherwise
|
||||||
|
*/
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
|
Collection<RODRecordList> rods = tracker.getBoundRodTracks();
|
||||||
|
if(rods.size() == 0) {
|
||||||
|
//if there's nothing overlapping this locus, skip it.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
final List<Object> refGeneRODs = (List<Object>) tracker.getReferenceMetaData("refgene");
|
||||||
|
|
||||||
|
for(Object refGeneRodObject : refGeneRODs) {
|
||||||
|
if(! (refGeneRodObject instanceof AnnotatorROD) ) {
|
||||||
|
throw new RuntimeException("The 'refgene' -B arg must have the type: 'AnnotatorInputTable'.");
|
||||||
|
}
|
||||||
|
|
||||||
|
AnnotatorROD refGeneRod = (AnnotatorROD) refGeneRodObject;
|
||||||
|
RefGeneTranscriptRecord parsedRefGeneRod = (RefGeneTranscriptRecord) refGeneRod.getTemporaryAttribute("parsedRefGeneRod");
|
||||||
|
if( parsedRefGeneRod == null ) {
|
||||||
|
parsedRefGeneRod = new RefGeneTranscriptRecord(refGeneRod);
|
||||||
|
refGeneRod.setTemporaryAttribute("parsedRefGeneRod", parsedRefGeneRod);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(parsedRefGeneRod.positiveStrand) {
|
||||||
|
parsedRefGeneRod.txSequence.append(ref.getBase());
|
||||||
|
} else {
|
||||||
|
parsedRefGeneRod.txSequence.insert(0, ref.getBase());
|
||||||
|
}
|
||||||
|
|
||||||
|
final long position = ref.getLocus().getStart();
|
||||||
|
if(parsedRefGeneRod.isProteinCodingTranscript() && position >= parsedRefGeneRod.cdsStart && position <= parsedRefGeneRod.cdsEnd && parsedRefGeneRod.isWithinExon(position) ) {
|
||||||
|
if(parsedRefGeneRod.positiveStrand) {
|
||||||
|
parsedRefGeneRod.cdsSequence.append(ref.getBase());
|
||||||
|
} else {
|
||||||
|
parsedRefGeneRod.cdsSequence.insert(0, ref.getBase());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(position == parsedRefGeneRod.txEnd) {
|
||||||
|
//we've reached the end of the transcript - compute all data and write it out.
|
||||||
|
generateOutputRecordsForROD(parsedRefGeneRod);
|
||||||
|
|
||||||
|
transcriptsProcessedCounter++;
|
||||||
|
if(transcriptsProcessedCounter % 100 == 0) {
|
||||||
|
System.err.println(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static final char[] ALLELES = {'A','C','G','T'};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private void generateOutputRecordsForROD(RefGeneTranscriptRecord parsedRefGeneRod) {
|
||||||
|
|
||||||
|
//Transcripts that don't produce proteins are indicated in refGene by cdsStart == cdsEnd
|
||||||
|
//These will be handled by generating only one record, with haplotypeAlternate == "*".
|
||||||
|
final boolean isProteinCodingTranscript = parsedRefGeneRod.isProteinCodingTranscript();
|
||||||
|
|
||||||
|
final boolean positiveStrand = parsedRefGeneRod.positiveStrand; //shorten the name
|
||||||
|
|
||||||
|
if(isProteinCodingTranscript && parsedRefGeneRod.cdsSequence.length() % 3 != 0) {
|
||||||
|
System.err.println("WARNING: The following record has " + parsedRefGeneRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping... \n" + parsedRefGeneRod.toString());
|
||||||
|
//TODO figure out why all CDS lengths aren't in multiples of 3 nucleotides - Sarah says its ok to thow these out.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//here, 5' and 3' are according to the DNA (not on the RNA)
|
||||||
|
int frame = 0; //the frame of the current position
|
||||||
|
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
|
||||||
|
int proteinOffset_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand
|
||||||
|
int codonOffset_from5 = 0; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons (this count is 1-based even though its set to 0 here.)
|
||||||
|
|
||||||
|
char[] currentCodon_5to3 = null; //holds the current RNA codon - 5' to 3'
|
||||||
|
|
||||||
|
|
||||||
|
final long txStart_5prime = positiveStrand ? parsedRefGeneRod.txStart : parsedRefGeneRod.txEnd; //1-based, inclusive
|
||||||
|
final long txEnd_3prime = positiveStrand ? parsedRefGeneRod.txEnd : parsedRefGeneRod.txStart; //1-based, inclusive
|
||||||
|
final int increment_5to3 = positiveStrand ? 1 : -1;
|
||||||
|
final int strandSign = increment_5to3; //alias
|
||||||
|
|
||||||
|
final long cdsStart_5prime = positiveStrand ? parsedRefGeneRod.cdsStart : parsedRefGeneRod.cdsEnd; //1-based, inclusive
|
||||||
|
final long cdsEnd_3prime = positiveStrand ? parsedRefGeneRod.cdsEnd : parsedRefGeneRod.cdsStart ; //1-based, inclusive
|
||||||
|
|
||||||
|
//compute chromosome key (later used to sort the output file)
|
||||||
|
boolean mitochondrial = false;
|
||||||
|
long chromKey;
|
||||||
|
final char lastChromChar = parsedRefGeneRod.txChrom.charAt(parsedRefGeneRod.txChrom.length() -1);
|
||||||
|
switch( Character.toLowerCase(lastChromChar) ) {
|
||||||
|
case 'm':
|
||||||
|
chromKey = 0;
|
||||||
|
mitochondrial = true;
|
||||||
|
break;
|
||||||
|
case 'x':
|
||||||
|
chromKey = 23;
|
||||||
|
break;
|
||||||
|
case 'y':
|
||||||
|
chromKey = 24;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
chromKey = Integer.parseInt(parsedRefGeneRod.txChrom.substring(3));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
chromKey++; //shift so there's no zero (otherwise, multiplication is screwed up in the next step)
|
||||||
|
chromKey *= 100000000000L;
|
||||||
|
|
||||||
|
|
||||||
|
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
|
||||||
|
{
|
||||||
|
//get the reference sequence
|
||||||
|
final char haplotypeReference = parsedRefGeneRod.txSequence.charAt( txOffset_from5 - 1 );
|
||||||
|
|
||||||
|
//whether the current txCoord_5to3 is within an exon
|
||||||
|
final boolean isWithinExon = parsedRefGeneRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron
|
||||||
|
|
||||||
|
//figure out what region the current position is in.
|
||||||
|
PositionType positionType;
|
||||||
|
if(isProteinCodingTranscript)
|
||||||
|
{
|
||||||
|
if(isWithinExon)
|
||||||
|
{
|
||||||
|
if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) {
|
||||||
|
positionType = PositionType.utr5;
|
||||||
|
} else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) {
|
||||||
|
positionType = PositionType.utr3;
|
||||||
|
} else {
|
||||||
|
positionType = PositionType.CDS;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
positionType = PositionType.intron;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if(isWithinExon) {
|
||||||
|
positionType = PositionType.non_coding_exon;
|
||||||
|
} else {
|
||||||
|
positionType = PositionType.non_coding_intron;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//compute current codon
|
||||||
|
if(positionType == PositionType.CDS && frame == 0)
|
||||||
|
{
|
||||||
|
if(currentCodon_5to3 == null) {
|
||||||
|
currentCodon_5to3 = new char[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
codonOffset_from5++;
|
||||||
|
|
||||||
|
currentCodon_5to3[0] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 - 1 ); //subtract 1 to go to zero-based coords
|
||||||
|
currentCodon_5to3[1] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 );
|
||||||
|
currentCodon_5to3[2] = parsedRefGeneRod.cdsSequence.charAt( proteinOffset_from5 + 1);
|
||||||
|
|
||||||
|
if(!positiveStrand) {
|
||||||
|
currentCodon_5to3[0] = BaseUtils.simpleComplement(currentCodon_5to3[0]);
|
||||||
|
currentCodon_5to3[1] = BaseUtils.simpleComplement(currentCodon_5to3[1]);
|
||||||
|
currentCodon_5to3[2] = BaseUtils.simpleComplement(currentCodon_5to3[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final String distanceToSpliceSiteString = Integer.toString(parsedRefGeneRod.distanceToSpliceSite(txCoord_5to3));
|
||||||
|
|
||||||
|
for(final char haplotypeAlternate : ALLELES )
|
||||||
|
{
|
||||||
|
final LinkedHashMap<String, String> outputLineFields = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_SORT_KEY, Long.toString( chromKey + txCoord_5to3) );
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_CHRPOS, parsedRefGeneRod.txChrom + ":" + txCoord_5to3);
|
||||||
|
outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( haplotypeReference ) );
|
||||||
|
outputLineFields.put(OUTPUT_HAPLOTYPE_ALTERNATE, isProteinCodingTranscript ? Character.toString( haplotypeAlternate ) : "*");
|
||||||
|
outputLineFields.put(OUTPUT_HAPLOTYPE_STRAND, positiveStrand ? "+" : "-");
|
||||||
|
outputLineFields.put(OUTPUT_GENE_NAME, parsedRefGeneRod.name2 );
|
||||||
|
outputLineFields.put(OUTPUT_TRANSCRIPT_ACCESSION, parsedRefGeneRod.name );
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() );
|
||||||
|
outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(txOffset_from5) );
|
||||||
|
outputLineFields.put(OUTPUT_SPLICE_DISTANCE, distanceToSpliceSiteString );
|
||||||
|
|
||||||
|
if(isProteinCodingTranscript)
|
||||||
|
{
|
||||||
|
outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) );
|
||||||
|
|
||||||
|
if(isWithinExon) {
|
||||||
|
outputLineFields.put(OUTPUT_CODING_COORD, Integer.toString(proteinOffset_from5) );
|
||||||
|
|
||||||
|
if(positionType == PositionType.CDS) {
|
||||||
|
final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
||||||
|
outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame) );
|
||||||
|
outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonOffset_from5) );
|
||||||
|
outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p."+ Integer.toString(codonOffset_from5) );
|
||||||
|
outputLineFields.put(OUTPUT_CODING_COORD_STR, "c."+ Integer.toString(proteinOffset_from5) );
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon );
|
||||||
|
final String refAA = AminoAcidTable.getAA3LetterCode(referenceCodon, mitochondrial);
|
||||||
|
outputLineFields.put(OUTPUT_REFERENCE_AA, refAA);
|
||||||
|
|
||||||
|
final char temp = currentCodon_5to3[frame];
|
||||||
|
currentCodon_5to3[frame] = haplotypeAlternate;
|
||||||
|
final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2];
|
||||||
|
final String variantAA = AminoAcidTable.getAA3LetterCode(variantCodon, mitochondrial);
|
||||||
|
outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon );
|
||||||
|
outputLineFields.put(OUTPUT_VARIANT_AA, variantAA);
|
||||||
|
|
||||||
|
currentCodon_5to3[frame] = temp;
|
||||||
|
|
||||||
|
outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(!refAA.equals(variantAA)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//write out the record
|
||||||
|
StringBuilder outputLine = new StringBuilder();
|
||||||
|
for( String column : OUTPUT_COLUMN_NAMES) {
|
||||||
|
if(outputLine.length() != 0) {
|
||||||
|
outputLine.append(AnnotatorROD.DELIMITER);
|
||||||
|
}
|
||||||
|
String value = outputLineFields.get(column);
|
||||||
|
outputLine.append(value != null ? value : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
outputStream.println(outputLine);
|
||||||
|
|
||||||
|
if( !isProteinCodingTranscript ) {
|
||||||
|
//need only one record for this position, instead of 4 (one for each allele)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
} //ALLELE for-loop
|
||||||
|
|
||||||
|
|
||||||
|
txOffset_from5++;
|
||||||
|
|
||||||
|
if(positionType == PositionType.CDS) {
|
||||||
|
proteinOffset_from5++;
|
||||||
|
frame = (frame + 1) % 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // l for-loop
|
||||||
|
|
||||||
|
} //method close
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increment the number of loci processed.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return the new number of loci processed.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return sum + value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tell the user the number of loci processed and close out the new variants file.
|
||||||
|
*
|
||||||
|
* @param result the number of loci seen.
|
||||||
|
*/
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
out.printf("Processed %d loci.\n", result);
|
||||||
|
|
||||||
|
outputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue