Updated to use Tribble-based GATKFeature instead of TabularROD

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3389 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
weisburd 2010-05-19 03:41:37 +00:00
parent d8469e2fba
commit 42ee16f256
1 changed files with 57 additions and 58 deletions

View File

@ -20,8 +20,9 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.AnnotatorROD;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.DataSource;
@ -52,7 +53,7 @@ import org.broadinstitute.sting.utils.StingException;
*/ */
@Reference(window=@Window(start=-4,stop=4)) @Reference(window=@Window(start=-4,stop=4))
@By(DataSource.REFERENCE) @By(DataSource.REFERENCE)
@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorROD.class) }) @Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorInputTableFeature.class) } )
public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap<String, String>> implements TreeReducible<TreeMap<String, String>> public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap<String, String>> implements TreeReducible<TreeMap<String, String>>
{ {
private static final String ROD_NAME = "transcripts"; private static final String ROD_NAME = "transcripts";
@ -68,10 +69,10 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
private final char[] ALLELES = {'A','C','G','T'}; private final char[] ALLELES = {'A','C','G','T'};
/** Output columns */ /** Output columns */
private final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN; private final String OUTPUT_CHRPOS = GenomicAnnotation.CHRPOS_COLUMN;
private final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN; private final String OUTPUT_HAPLOTYPE_REFERENCE = GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN;
private final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN; private final String OUTPUT_HAPLOTYPE_ALTERNATE = GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN;
private final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN; private final String OUTPUT_HAPLOTYPE_STRAND = GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN;
private final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true private final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true
@ -142,7 +143,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
final ArrayList<String> header; final ArrayList<String> header;
try { try {
header = AnnotatorROD.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile()); header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile());
} catch(Exception e) { } catch(Exception e) {
throw new StingException("Failed when attempting to read header from file: " + transcriptsDataSource.getReferenceOrderedData().getFile(), e); throw new StingException("Failed when attempting to read header from file: " + transcriptsDataSource.getReferenceOrderedData().getFile(), e);
} }
@ -183,7 +184,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
StringBuilder outputHeaderLine = new StringBuilder(); StringBuilder outputHeaderLine = new StringBuilder();
for( final String column : outputColumnNames ) { for( final String column : outputColumnNames ) {
if(outputHeaderLine.length() != 0) { if(outputHeaderLine.length() != 0) {
outputHeaderLine.append( AnnotatorROD.DELIMITER ); outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER );
} }
outputHeaderLine.append(column); outputHeaderLine.append(column);
} }
@ -249,7 +250,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
for(Object transcriptRodObject : transcriptRODs) for(Object transcriptRodObject : transcriptRODs)
{ {
//parse this ROD if it hasn't been already. //parse this ROD if it hasn't been already.
final AnnotatorROD transcriptRod = (AnnotatorROD) transcriptRodObject; final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject;
TranscriptTableRecord parsedTranscriptRod = (TranscriptTableRecord) transcriptRod.getTemporaryAttribute("parsedTranscriptRod"); TranscriptTableRecord parsedTranscriptRod = (TranscriptTableRecord) transcriptRod.getTemporaryAttribute("parsedTranscriptRod");
if( parsedTranscriptRod == null ) { if( parsedTranscriptRod == null ) {
parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS); parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS);
@ -265,7 +266,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
} }
//populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence //populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence
final long position = ref.getLocus().getStart(); final int position = (int) ref.getLocus().getStart();
if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) ) if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) )
{ {
//we're within an exon of a proteinCodingTranscript //we're within an exon of a proteinCodingTranscript
@ -372,13 +373,13 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
return; return;
} }
final long txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive
final long txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive
final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement
final int strandSign = increment_5to3; //alias final int strandSign = increment_5to3; //alias
final long cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive
final long cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive
int frame = 0; //the frame of the current position int frame = 0; //the frame of the current position
int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand
@ -395,12 +396,12 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
PositionType positionType = null; PositionType positionType = null;
boolean isWithinIntronAndFarFromSpliceJunction = false; boolean isWithinIntronAndFarFromSpliceJunction = false;
long intronStart_5prime = -1; int intronStart_5prime = -1;
long intronEnd_5prime = -1; int intronEnd_5prime = -1;
final Map<String, String> outputLineFields = new HashMap<String, String>(); final Map<String, String> outputLineFields = new HashMap<String, String>();
for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3)
{ {
++totalPositionsCounter; ++totalPositionsCounter;
@ -466,8 +467,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
//output intron record //output intron record
intronEnd_5prime = txCoord_5to3 - increment_5to3; intronEnd_5prime = txCoord_5to3 - increment_5to3;
final long intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ;
final long intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime);
outputLineFields.clear(); outputLineFields.clear();
outputLineFields.put(OUTPUT_CHRPOS, parsedTranscriptRod.txChrom + ":" + intronStart + "-" + intronEnd); outputLineFields.put(OUTPUT_CHRPOS, parsedTranscriptRod.txChrom + ":" + intronStart + "-" + intronEnd);
outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) ); outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) );
@ -758,7 +759,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
final StringBuilder outputLine = new StringBuilder(); final StringBuilder outputLine = new StringBuilder();
for( final String column : outputColumnNames ) { for( final String column : outputColumnNames ) {
if(outputLine.length() != 0) { if(outputLine.length() != 0) {
outputLine.append( AnnotatorROD.DELIMITER ); outputLine.append( AnnotatorInputTableCodec.DELIMITER );
} }
final String value = outputLineFields.get(column); final String value = outputLineFields.get(column);
if(value != null) { if(value != null) {
@ -804,7 +805,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* @param parsedTranscriptRod * @param parsedTranscriptRod
* @return * @return
*/ */
protected String computeSortKey(String txChrom, long position, char allele, TranscriptTableRecord parsedTranscriptRod) protected String computeSortKey(String txChrom, int position, char allele, TranscriptTableRecord parsedTranscriptRod)
{ {
//compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility) //compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility)
long key; long key;
@ -909,17 +910,17 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
public String[] geneNames; //eg. NM_021649 public String[] geneNames; //eg. NM_021649
public String txChrom; //The chromosome name public String txChrom; //The chromosome name
public long txStart; public int txStart;
public long txEnd; public int txEnd;
public long cdsStart; public int cdsStart;
public long cdsEnd; public int cdsEnd;
public long[] exonStarts; public int[] exonStarts;
public long[] exonEnds; public int[] exonEnds;
//public int[] exonFrames; - not used for anything, frame is computed another way //public int[] exonFrames; - not used for anything, frame is computed another way
private AnnotatorROD rod; private AnnotatorInputTableFeature rod;
/** /**
@ -927,11 +928,11 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* *
* @param transcriptRod A rod representing a single record in the transcript table. * @param transcriptRod A rod representing a single record in the transcript table.
*/ */
public TranscriptTableRecord(final AnnotatorROD transcriptRod, String[] geneNameColumns) { public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) {
this.rod = transcriptRod; this.rod = transcriptRod;
//String binStr = transcriptRod.get("bin"); //String binStr = transcriptRod.get("bin");
//String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated )
String strandStr = transcriptRod.get(STRAND_COLUMN); String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN);
if(strandStr == null) { if(strandStr == null) {
throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\""); throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\"");
} else if(strandStr.equals("+")) { } else if(strandStr.equals("+")) {
@ -944,22 +945,20 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
geneNames = new String[geneNameColumns.length]; geneNames = new String[geneNameColumns.length];
for(int i = 0; i < geneNameColumns.length; i++) { for(int i = 0; i < geneNameColumns.length; i++) {
geneNames[i] = transcriptRod.get(geneNameColumns[i]); geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]);
} }
//String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. they got turned into chr:txStart-txStop) //String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. they got turned into chr:txStart-txStop)
//String txEndStr = transcriptRod.get(TXEND_COLUMN); //String txEndStr = transcriptRod.get(TXEND_COLUMN);
txChrom = transcriptRod.getChr();
txStart = transcriptRod.getStart();
txEnd = transcriptRod.getEnd();
final GenomeLoc loc = transcriptRod.getLocation(); String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN);
txChrom = loc.getContig(); String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN);
txStart = loc.getStart();
txEnd = loc.getStop();
String cdsStartStr = transcriptRod.get(CDS_START_COLUMN); cdsStart = Integer.parseInt(cdsStartStr);
String cdsEndStr = transcriptRod.get(CDS_END_COLUMN); cdsEnd = Integer.parseInt(cdsEndStr);
cdsStart = Long.parseLong(cdsStartStr);
cdsEnd = Long.parseLong(cdsEndStr);
txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3'
if(isProteinCodingTranscript()) { if(isProteinCodingTranscript()) {
@ -967,9 +966,9 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns
} }
String exonCountStr = transcriptRod.get(EXON_COUNT_COLUMN); String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN);
String exonStartsStr = transcriptRod.get(EXON_STARTS_COLUMN); String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN);
String exonEndsStr = transcriptRod.get(EXON_ENDS_COLUMN); String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN);
//String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN); //String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN);
String[] exonStartStrs = exonStartsStr.split(","); String[] exonStartStrs = exonStartsStr.split(",");
@ -982,12 +981,12 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". transcriptRod = " + transcriptRod); throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". transcriptRod = " + transcriptRod);
} }
exonStarts = new long[exonCount]; exonStarts = new int[exonCount];
exonEnds = new long[exonCount]; exonEnds = new int[exonCount];
//exonFrames = new int[exonCount]; //exonFrames = new int[exonCount];
for(int i = 0; i < exonCount; i++) { for(int i = 0; i < exonCount; i++) {
exonStarts[i] = Long.parseLong(exonStartStrs[i]); exonStarts[i] = Integer.parseInt(exonStartStrs[i]);
exonEnds[i] = Long.parseLong(exonEndStrs[i]); exonEnds[i] = Integer.parseInt(exonEndStrs[i]);
//exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]);
} }
} }
@ -997,13 +996,13 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* Takes a genomic position on the same contig as the transcript, and * Takes a genomic position on the same contig as the transcript, and
* returns true if this position falls within an exon. * returns true if this position falls within an exon.
*/ */
public boolean isWithinExon(final long genomPosition) { public boolean isWithinExon(final int genomPosition) {
for(int i = 0; i < exonStarts.length; i++) { for(int i = 0; i < exonStarts.length; i++) {
final long curStart = exonStarts[i]; final int curStart = exonStarts[i];
if(genomPosition < curStart) { if(genomPosition < curStart) {
return false; return false;
} }
final long curStop = exonEnds[i]; final int curStop = exonEnds[i];
if(genomPosition <= curStop) { if(genomPosition <= curStop) {
return true; return true;
} }
@ -1017,10 +1016,10 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
* The returned value is negative its on the 5' side (eg. upstream) of the juntion, and * The returned value is negative its on the 5' side (eg. upstream) of the juntion, and
* positive if its on the 3' side. * positive if its on the 3' side.
*/ */
public int computeDistanceToNearestSpliceSite(final long genomPosition) { public int computeDistanceToNearestSpliceSite(final int genomPosition) {
int prevDistance = Integer.MAX_VALUE; int prevDistance = Integer.MAX_VALUE;
for(int i = 0; i < exonStarts.length; i++) { for(int i = 0; i < exonStarts.length; i++) {
final long curStart = exonStarts[i]; final int curStart = exonStarts[i];
int curDistance = (int) (curStart-genomPosition); int curDistance = (int) (curStart-genomPosition);
if(genomPosition < curStart) { if(genomPosition < curStart) {
//position is within the current intron //position is within the current intron
@ -1033,7 +1032,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
prevDistance = (int) (genomPosition - curStart) + 1; prevDistance = (int) (genomPosition - curStart) + 1;
} }
final long curStop = exonEnds[i]; final int curStop = exonEnds[i];
curDistance = (int) (curStop-genomPosition) + 1; curDistance = (int) (curStop-genomPosition) + 1;
if(genomPosition <= curStop) { if(genomPosition <= curStop) {
//position is within an exon //position is within an exon
@ -1089,8 +1088,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
int result = 0; int result = 0;
for(int i = 0; i < exonStarts.length; i++) for(int i = 0; i < exonStarts.length; i++)
{ {
final long exonStart = exonStarts[i]; final int exonStart = exonStarts[i];
final long exonEnd = exonEnds[i]; final int exonEnd = exonEnds[i];
if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart
//this means cdsStart is within the current exon //this means cdsStart is within the current exon
result += (cdsStart - exonStart) + 1; result += (cdsStart - exonStart) + 1;
@ -1104,7 +1103,7 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
} }
else //(negative strand) else //(negative strand)
{ {
final long cdsStart_5prime = cdsEnd; final int cdsStart_5prime = cdsEnd;
if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) { if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) {
//the 1st nucleotide of the transcript is CDS. //the 1st nucleotide of the transcript is CDS.
return 1; return 1;
@ -1113,8 +1112,8 @@ public class TranscriptToInfo extends RodWalker<TreeMap<String, String>, TreeMap
int result = 0; int result = 0;
for(int i = exonEnds.length - 1; i >= 0; i--) for(int i = exonEnds.length - 1; i >= 0; i--)
{ {
final long exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i]
final long exonEnd = exonStarts[i]; final int exonEnd = exonStarts[i];
if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart
//this means cdsStart is within the current exon //this means cdsStart is within the current exon
result += -(cdsStart_5prime - exonStart) + 1; result += -(cdsStart_5prime - exonStart) + 1;