Updated to use Tribble-based GATKFeature instead of TabularROD

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3389 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
weisburd 2010-05-19 03:41:37 +00:00
parent d8469e2fba
commit 42ee16f256
1 changed files with 57 additions and 58 deletions

View File

@ -20,8 +20,9 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
@ -52,7 +53,7 @@ import org.broadinstitute.sting.utils.StingException;
*/
@Reference(window=@Window(start=-4,stop=4))
@By(DataSource.REFERENCE)
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorROD.class) })
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorInputTableFeature.class) } )
public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap<String, String>> implements TreeReducible<TreeMap<String, String>>
{
private static final String ROD_NAME = "transcripts";
@ -68,10 +69,10 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
private final char[] ALLELES = {'A','C','G','T'};
/** Output columns */
private final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN;
private final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN;
private final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN;
private final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN;
private final String OUTPUT_CHRPOS = GenomicAnnotation.CHRPOS_COLUMN;
private final String OUTPUT_HAPLOTYPE_REFERENCE = GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN;
private final String OUTPUT_HAPLOTYPE_ALTERNATE = GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN;
private final String OUTPUT_HAPLOTYPE_STRAND = GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN;
private final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true
@ -142,7 +143,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
final ArrayList<String> header;
try {
header = AnnotatorROD.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile());
header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile());
} catch(Exception e) {
throw new StingException("Failed when attempting to read header from file: " + transcriptsDataSource.getReferenceOrderedData().getFile(), e);
}
@ -183,7 +184,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
StringBuilder outputHeaderLine = new StringBuilder();
for( final String column : outputColumnNames ) {
if(outputHeaderLine.length() != 0) {
outputHeaderLine.append( AnnotatorROD.DELIMITER );
outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER );
}
outputHeaderLine.append(column);
}
@ -249,7 +250,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
for(Object transcriptRodObject : transcriptRODs)
{
//parse this ROD if it hasn't been already.
final AnnotatorROD transcriptRod = (AnnotatorROD) transcriptRodObject;
final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject;
TranscriptTableRecord parsedTranscriptRod = (TranscriptTableRecord) transcriptRod.getTemporaryAttribute("parsedTranscriptRod");
if( parsedTranscriptRod == null ) {
parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS);
@ -265,7 +266,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
}
//populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence
final long position = ref.getLocus().getStart();
final int position = (int) ref.getLocus().getStart();
if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) )
{
//we're within an exon of a proteinCodingTranscript
@ -372,13 +373,13 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
return;
}
final long txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive
final long txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive
final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive
final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive
final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement
final int strandSign = increment_5to3; //alias
final long cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive
final long cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive
final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive
final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive
int frame = 0; //the frame of the current position
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
@ -395,12 +396,12 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
PositionType positionType = null;
boolean isWithinIntronAndFarFromSpliceJunction = false;
long intronStart_5prime = -1;
long intronEnd_5prime = -1;
int intronStart_5prime = -1;
int intronEnd_5prime = -1;
final Map<String, String> outputLineFields = new HashMap<String, String>();
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
{
++totalPositionsCounter;
@ -466,8 +467,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
//output intron record
intronEnd_5prime = txCoord_5to3 - increment_5to3;
final long intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ;
final long intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime);
final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ;
final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime);
outputLineFields.clear();
outputLineFields.put(OUTPUT_CHRPOS, parsedTranscriptRod.txChrom + ":" + intronStart + "-" + intronEnd);
outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) );
@ -758,7 +759,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
final StringBuilder outputLine = new StringBuilder();
for( final String column : outputColumnNames ) {
if(outputLine.length() != 0) {
outputLine.append( AnnotatorROD.DELIMITER );
outputLine.append( AnnotatorInputTableCodec.DELIMITER );
}
final String value = outputLineFields.get(column);
if(value != null) {
@ -804,7 +805,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* @param parsedTranscriptRod
* @return
*/
protected String computeSortKey(String txChrom, long position, char allele, TranscriptTableRecord parsedTranscriptRod)
protected String computeSortKey(String txChrom, int position, char allele, TranscriptTableRecord parsedTranscriptRod)
{
//compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
long key;
@ -909,17 +910,17 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
public String[] geneNames; //eg. NM_021649
public String txChrom; //The chromosome name
public long txStart;
public long txEnd;
public int txStart;
public int txEnd;
public long cdsStart;
public long cdsEnd;
public int cdsStart;
public int cdsEnd;
public long[] exonStarts;
public long[] exonEnds;
public int[] exonStarts;
public int[] exonEnds;
//public int[] exonFrames; - not used for anything, frame is computed another way
private AnnotatorROD rod;
private AnnotatorInputTableFeature rod;
/**
@ -927,11 +928,11 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
*
* @param transcriptRod A rod representing a single record in the transcript table.
*/
public TranscriptTableRecord(final AnnotatorROD transcriptRod, String[] geneNameColumns) {
public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) {
this.rod = transcriptRod;
//String binStr = transcriptRod.get("bin");
//String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
String strandStr = transcriptRod.get(STRAND_COLUMN);
String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN);
if(strandStr == null) {
throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\"");
} else if(strandStr.equals("+")) {
@ -944,22 +945,20 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
geneNames = new String[geneNameColumns.length];
for(int i = 0; i < geneNameColumns.length; i++) {
geneNames[i] = transcriptRod.get(geneNameColumns[i]);
geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]);
}
//String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. they got turned into chr:txStart-txStop)
//String txEndStr = transcriptRod.get(TXEND_COLUMN);
txChrom = transcriptRod.getChr();
txStart = transcriptRod.getStart();
txEnd = transcriptRod.getEnd();
final GenomeLoc loc = transcriptRod.getLocation();
txChrom = loc.getContig();
txStart = loc.getStart();
txEnd = loc.getStop();
String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN);
String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN);
String cdsStartStr = transcriptRod.get(CDS_START_COLUMN);
String cdsEndStr = transcriptRod.get(CDS_END_COLUMN);
cdsStart = Long.parseLong(cdsStartStr);
cdsEnd = Long.parseLong(cdsEndStr);
cdsStart = Integer.parseInt(cdsStartStr);
cdsEnd = Integer.parseInt(cdsEndStr);
txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3'
if(isProteinCodingTranscript()) {
@ -967,9 +966,9 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
}
String exonCountStr = transcriptRod.get(EXON_COUNT_COLUMN);
String exonStartsStr = transcriptRod.get(EXON_STARTS_COLUMN);
String exonEndsStr = transcriptRod.get(EXON_ENDS_COLUMN);
String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN);
String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN);
String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN);
//String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN);
String[] exonStartStrs = exonStartsStr.split(",");
@ -982,12 +981,12 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". transcriptRod = " + transcriptRod);
}
exonStarts = new long[exonCount];
exonEnds = new long[exonCount];
exonStarts = new int[exonCount];
exonEnds = new int[exonCount];
//exonFrames = new int[exonCount];
for(int i = 0; i < exonCount; i++) {
exonStarts[i] = Long.parseLong(exonStartStrs[i]);
exonEnds[i] = Long.parseLong(exonEndStrs[i]);
exonStarts[i] = Integer.parseInt(exonStartStrs[i]);
exonEnds[i] = Integer.parseInt(exonEndStrs[i]);
//exonFrames[i] = Integer.parseInt(exonFrameStrs[i]);
}
}
@ -997,13 +996,13 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* Takes a genomic position on the same contig as the transcript, and
* returns true if this position falls within an exon.
*/
public boolean isWithinExon(final long genomPosition) {
public boolean isWithinExon(final int genomPosition) {
for(int i = 0; i < exonStarts.length; i++) {
final long curStart = exonStarts[i];
final int curStart = exonStarts[i];
if(genomPosition < curStart) {
return false;
}
final long curStop = exonEnds[i];
final int curStop = exonEnds[i];
if(genomPosition <= curStop) {
return true;
}
@ -1017,10 +1016,10 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* The returned value is negative its on the 5' side (eg. upstream) of the juntion, and
* positive if its on the 3' side.
*/
public int computeDistanceToNearestSpliceSite(final long genomPosition) {
public int computeDistanceToNearestSpliceSite(final int genomPosition) {
int prevDistance = Integer.MAX_VALUE;
for(int i = 0; i < exonStarts.length; i++) {
final long curStart = exonStarts[i];
final int curStart = exonStarts[i];
int curDistance = (int) (curStart-genomPosition);
if(genomPosition < curStart) {
//position is within the current intron
@ -1033,7 +1032,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
prevDistance = (int) (genomPosition - curStart) + 1;
}
final long curStop = exonEnds[i];
final int curStop = exonEnds[i];
curDistance = (int) (curStop-genomPosition) + 1;
if(genomPosition <= curStop) {
//position is within an exon
@ -1089,8 +1088,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
int result = 0;
for(int i = 0; i < exonStarts.length; i++)
{
final long exonStart = exonStarts[i];
final long exonEnd = exonEnds[i];
final int exonStart = exonStarts[i];
final int exonEnd = exonEnds[i];
if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart
//this means cdsStart is within the current exon
result += (cdsStart - exonStart) + 1;
@ -1104,7 +1103,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
}
else //(negative strand)
{
final long cdsStart_5prime = cdsEnd;
final int cdsStart_5prime = cdsEnd;
if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) {
//the 1st nucleotide of the transcript is CDS.
return 1;
@ -1113,8 +1112,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
int result = 0;
for(int i = exonEnds.length - 1; i >= 0; i--)
{
final long exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i]
final long exonEnd = exonStarts[i];
final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i]
final int exonEnd = exonStarts[i];
if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart
//this means cdsStart is within the current exon
result += -(cdsStart_5prime - exonStart) + 1;