diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java index cfe921f8f..de16d6207 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/TranscriptToInfo.java @@ -20,8 +20,9 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.AnnotatorROD; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableCodec; +import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; @@ -52,7 +53,7 @@ import org.broadinstitute.sting.utils.StingException; */ @Reference(window=@Window(start=-4,stop=4)) @By(DataSource.REFERENCE) -@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorROD.class) }) +@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name="transcripts",type=AnnotatorInputTableFeature.class) } ) public class TranscriptToInfo extends RodWalker, TreeMap> implements TreeReducible> { private static final String ROD_NAME = "transcripts"; @@ -68,10 +69,10 @@ public class TranscriptToInfo extends RodWalker, TreeMap private final char[] ALLELES = {'A','C','G','T'}; /** Output columns */ - private final String OUTPUT_CHRPOS = AnnotatorROD.CHRPOS_COLUMN; - private final String OUTPUT_HAPLOTYPE_REFERENCE = AnnotatorROD.HAPLOTYPE_REFERENCE_COLUMN; - private final String OUTPUT_HAPLOTYPE_ALTERNATE = AnnotatorROD.HAPLOTYPE_ALTERNATE_COLUMN; - private final String OUTPUT_HAPLOTYPE_STRAND = AnnotatorROD.HAPLOTYPE_STRAND_COLUMN; + private final String OUTPUT_CHRPOS = GenomicAnnotation.CHRPOS_COLUMN; + private final String OUTPUT_HAPLOTYPE_REFERENCE = GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN; + private final String OUTPUT_HAPLOTYPE_ALTERNATE = GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN; + private final String OUTPUT_HAPLOTYPE_STRAND = GenomicAnnotation.HAPLOTYPE_STRAND_COLUMN; private final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true @@ -142,7 +143,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap final ArrayList header; try { - header = AnnotatorROD.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile()); + header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getReferenceOrderedData().getFile()); } catch(Exception e) { throw new StingException("Failed when attempting to read header from file: " + transcriptsDataSource.getReferenceOrderedData().getFile(), e); } @@ -183,7 +184,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap StringBuilder outputHeaderLine = new StringBuilder(); for( final String column : outputColumnNames ) { if(outputHeaderLine.length() != 0) { - outputHeaderLine.append( AnnotatorROD.DELIMITER ); + outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER ); } outputHeaderLine.append(column); } @@ -249,7 +250,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap for(Object transcriptRodObject : transcriptRODs) { //parse this ROD if it hasn't been already. - final AnnotatorROD transcriptRod = (AnnotatorROD) transcriptRodObject; + final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject; TranscriptTableRecord parsedTranscriptRod = (TranscriptTableRecord) transcriptRod.getTemporaryAttribute("parsedTranscriptRod"); if( parsedTranscriptRod == null ) { parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS); @@ -265,7 +266,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap } //populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence - final long position = ref.getLocus().getStart(); + final int position = (int) ref.getLocus().getStart(); if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) ) { //we're within an exon of a proteinCodingTranscript @@ -372,13 +373,13 @@ public class TranscriptToInfo extends RodWalker, TreeMap return; } - final long txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive - final long txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive + final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive + final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement final int strandSign = increment_5to3; //alias - final long cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive - final long cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive + final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive + final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive int frame = 0; //the frame of the current position int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand @@ -395,12 +396,12 @@ public class TranscriptToInfo extends RodWalker, TreeMap PositionType positionType = null; boolean isWithinIntronAndFarFromSpliceJunction = false; - long intronStart_5prime = -1; - long intronEnd_5prime = -1; + int intronStart_5prime = -1; + int intronEnd_5prime = -1; final Map outputLineFields = new HashMap(); - for(long txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) + for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) { ++totalPositionsCounter; @@ -466,8 +467,8 @@ public class TranscriptToInfo extends RodWalker, TreeMap //output intron record intronEnd_5prime = txCoord_5to3 - increment_5to3; - final long intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; - final long intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); + final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; + final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); outputLineFields.clear(); outputLineFields.put(OUTPUT_CHRPOS, parsedTranscriptRod.txChrom + ":" + intronStart + "-" + intronEnd); outputLineFields.put(OUTPUT_HAPLOTYPE_REFERENCE, Character.toString( '*' ) ); @@ -758,7 +759,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap final StringBuilder outputLine = new StringBuilder(); for( final String column : outputColumnNames ) { if(outputLine.length() != 0) { - outputLine.append( AnnotatorROD.DELIMITER ); + outputLine.append( AnnotatorInputTableCodec.DELIMITER ); } final String value = outputLineFields.get(column); if(value != null) { @@ -804,7 +805,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap * @param parsedTranscriptRod * @return */ - protected String computeSortKey(String txChrom, long position, char allele, TranscriptTableRecord parsedTranscriptRod) + protected String computeSortKey(String txChrom, int position, char allele, TranscriptTableRecord parsedTranscriptRod) { //compute chromosome sort key (this becomes the 1st column in the file and is later used to sort the output file using the GNU command-line sort utility) long key; @@ -909,17 +910,17 @@ public class TranscriptToInfo extends RodWalker, TreeMap public String[] geneNames; //eg. NM_021649 public String txChrom; //The chromosome name - public long txStart; - public long txEnd; + public int txStart; + public int txEnd; - public long cdsStart; - public long cdsEnd; + public int cdsStart; + public int cdsEnd; - public long[] exonStarts; - public long[] exonEnds; + public int[] exonStarts; + public int[] exonEnds; //public int[] exonFrames; - not used for anything, frame is computed another way - private AnnotatorROD rod; + private AnnotatorInputTableFeature rod; /** @@ -927,11 +928,11 @@ public class TranscriptToInfo extends RodWalker, TreeMap * * @param transcriptRod A rod representing a single record in the transcript table. */ - public TranscriptTableRecord(final AnnotatorROD transcriptRod, String[] geneNameColumns) { + public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) { this.rod = transcriptRod; //String binStr = transcriptRod.get("bin"); //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) - String strandStr = transcriptRod.get(STRAND_COLUMN); + String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN); if(strandStr == null) { throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\""); } else if(strandStr.equals("+")) { @@ -944,22 +945,20 @@ public class TranscriptToInfo extends RodWalker, TreeMap geneNames = new String[geneNameColumns.length]; for(int i = 0; i < geneNameColumns.length; i++) { - geneNames[i] = transcriptRod.get(geneNameColumns[i]); + geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]); } //String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. they got turned into chr:txStart-txStop) //String txEndStr = transcriptRod.get(TXEND_COLUMN); + txChrom = transcriptRod.getChr(); + txStart = transcriptRod.getStart(); + txEnd = transcriptRod.getEnd(); - final GenomeLoc loc = transcriptRod.getLocation(); - txChrom = loc.getContig(); - txStart = loc.getStart(); - txEnd = loc.getStop(); + String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN); + String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN); - String cdsStartStr = transcriptRod.get(CDS_START_COLUMN); - String cdsEndStr = transcriptRod.get(CDS_END_COLUMN); - - cdsStart = Long.parseLong(cdsStartStr); - cdsEnd = Long.parseLong(cdsEndStr); + cdsStart = Integer.parseInt(cdsStartStr); + cdsEnd = Integer.parseInt(cdsEndStr); txSequence = new StringBuilder( (int) (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' if(isProteinCodingTranscript()) { @@ -967,9 +966,9 @@ public class TranscriptToInfo extends RodWalker, TreeMap cdsSequence = new StringBuilder( (int) (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns } - String exonCountStr = transcriptRod.get(EXON_COUNT_COLUMN); - String exonStartsStr = transcriptRod.get(EXON_STARTS_COLUMN); - String exonEndsStr = transcriptRod.get(EXON_ENDS_COLUMN); + String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN); + String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN); + String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN); //String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN); String[] exonStartStrs = exonStartsStr.split(","); @@ -982,12 +981,12 @@ public class TranscriptToInfo extends RodWalker, TreeMap throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". transcriptRod = " + transcriptRod); } - exonStarts = new long[exonCount]; - exonEnds = new long[exonCount]; + exonStarts = new int[exonCount]; + exonEnds = new int[exonCount]; //exonFrames = new int[exonCount]; for(int i = 0; i < exonCount; i++) { - exonStarts[i] = Long.parseLong(exonStartStrs[i]); - exonEnds[i] = Long.parseLong(exonEndStrs[i]); + exonStarts[i] = Integer.parseInt(exonStartStrs[i]); + exonEnds[i] = Integer.parseInt(exonEndStrs[i]); //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); } } @@ -997,13 +996,13 @@ public class TranscriptToInfo extends RodWalker, TreeMap * Takes a genomic position on the same contig as the transcript, and * returns true if this position falls within an exon. */ - public boolean isWithinExon(final long genomPosition) { + public boolean isWithinExon(final int genomPosition) { for(int i = 0; i < exonStarts.length; i++) { - final long curStart = exonStarts[i]; + final int curStart = exonStarts[i]; if(genomPosition < curStart) { return false; } - final long curStop = exonEnds[i]; + final int curStop = exonEnds[i]; if(genomPosition <= curStop) { return true; } @@ -1017,10 +1016,10 @@ public class TranscriptToInfo extends RodWalker, TreeMap * The returned value is negative its on the 5' side (eg. upstream) of the juntion, and * positive if its on the 3' side. */ - public int computeDistanceToNearestSpliceSite(final long genomPosition) { + public int computeDistanceToNearestSpliceSite(final int genomPosition) { int prevDistance = Integer.MAX_VALUE; for(int i = 0; i < exonStarts.length; i++) { - final long curStart = exonStarts[i]; + final int curStart = exonStarts[i]; int curDistance = (int) (curStart-genomPosition); if(genomPosition < curStart) { //position is within the current intron @@ -1033,7 +1032,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap prevDistance = (int) (genomPosition - curStart) + 1; } - final long curStop = exonEnds[i]; + final int curStop = exonEnds[i]; curDistance = (int) (curStop-genomPosition) + 1; if(genomPosition <= curStop) { //position is within an exon @@ -1089,8 +1088,8 @@ public class TranscriptToInfo extends RodWalker, TreeMap int result = 0; for(int i = 0; i < exonStarts.length; i++) { - final long exonStart = exonStarts[i]; - final long exonEnd = exonEnds[i]; + final int exonStart = exonStarts[i]; + final int exonEnd = exonEnds[i]; if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart //this means cdsStart is within the current exon result += (cdsStart - exonStart) + 1; @@ -1104,7 +1103,7 @@ public class TranscriptToInfo extends RodWalker, TreeMap } else //(negative strand) { - final long cdsStart_5prime = cdsEnd; + final int cdsStart_5prime = cdsEnd; if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) { //the 1st nucleotide of the transcript is CDS. return 1; @@ -1113,8 +1112,8 @@ public class TranscriptToInfo extends RodWalker, TreeMap int result = 0; for(int i = exonEnds.length - 1; i >= 0; i--) { - final long exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] - final long exonEnd = exonStarts[i]; + final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] + final int exonEnd = exonStarts[i]; if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart //this means cdsStart is within the current exon result += -(cdsStart_5prime - exonStart) + 1;