diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java new file mode 100644 index 000000000..dbf2ff830 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.gatk.refdata.features.refseq; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.util.LineReader; +import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.Transcript; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * the ref seq codec + */ +public class RefSeqCodec implements FeatureCodec { + + @Override + public Feature decodeLoc(String line) { + String fields[] = line.split("\t"); + String contig_name = fields[2]; + return new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); + } + + /** Fills this object from a text line in RefSeq (UCSC) text dump file */ + @Override + public Feature decode(String line) { + String fields[] = line.split("\t"); + + String contig_name = fields[2]; + RefSeqFeature feature = new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); + + feature.setTranscript_id(fields[1]); + if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); + else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); + else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]); + + + feature.setTranscript_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + feature.setTranscript_coding_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]))); + feature.setGene_name(fields[12]); + String[] exon_starts = fields[9].split(","); + String[] exon_stops = fields[10].split(","); + String[] eframes = fields[15].split(","); + + assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ"; + assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ"; + + ArrayList exons = new ArrayList(exon_starts.length); + ArrayList exon_frames = new ArrayList(eframes.length); + + for ( int i = 0 ; i < exon_starts.length ; i++ ) { + exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exon_frames.add(Integer.decode(eframes[i])); + } + + feature.setExons(exons); + feature.setExon_frames(exon_frames); + return feature; + } + + @Override + public int readHeader(LineReader reader) { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public Class getFeatureType() { + return RefSeqCodec.class; + } + + @Override + public Object getHeader(Class clazz) throws ClassCastException { + return null; // we don't have a header + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java new file mode 100644 index 000000000..d593c4dc0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java @@ -0,0 +1,285 @@ +package org.broadinstitute.sting.gatk.refdata.features.refseq; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.refdata.Transcript; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * the ref seq feature + */ +public class RefSeqFeature implements Transcript, Feature { + + private String transcript_id; + private int strand; + private GenomeLoc transcript_interval; + private GenomeLoc transcript_coding_interval; + private List exons; + private String gene_name; + private List exon_frames; + private String name; + + // store the contig, start, and stop for this record + private final String contig; + private final int start; + private final int stop; + + public RefSeqFeature(String contig, int start, int stop) { + this.contig = contig; + this.start = start; + this.stop = stop; + } + + /** Returns id of the transcript (RefSeq NM_* id) */ + public String getTranscriptId() { return transcript_id; } + + /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ + public int getStrand() { return strand; } + + /** Returns transcript's full genomic interval (includes all exons with UTRs) */ + public GenomeLoc getLocation() { + if (transcript_interval == null) + transcript_interval = GenomeLocParser.parseGenomeLoc(contig,start,stop); + return transcript_interval; + } + + /** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */ + public GenomeLoc getCodingLocation() { return transcript_coding_interval; } + + /** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */ + public String getGeneName() { return gene_name; } + + /** Number of exons in this transcript */ + public int getNumExons() { return exons.size(); } + + /** Genomic location of the n-th exon; throws an exception if n is out of bounds */ + public GenomeLoc getExonLocation(int n) { + if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n); + return exons.get(n); + } + + /** Returns the list of all exons in this transcript, as genomic intervals */ + public List getExons() { return exons; } + + /** Returns all exons falling ::entirely:: inside an interval **/ + public List getExonsInInterval( GenomeLoc interval ) { + List relevantExons = new ArrayList(exons.size()); + for ( GenomeLoc exon : getExons() ) { + if ( interval.containsP(exon) ) { + relevantExons.add(exon); + } + } + + return relevantExons; + } + + /** convenience method; returns the numbers of the exons in the interval **/ + public List getExonNumbersInInterval( GenomeLoc interval ) { + List numbers = new ArrayList(); + int iNo = 0; + for ( GenomeLoc exon : getExons() ) { + if ( interval.containsP(exon) ) { + numbers.add(iNo); + } + iNo++; + } + + return numbers; + } + + public String getTranscriptUniqueGeneName() { + return String.format("%s(%s)",getGeneName(),getTranscriptId()); + } + + public String getOverlapString(GenomeLoc position) { + boolean is_exon = false; + StringBuilder overlapString = new StringBuilder(); + int exonNo = 1; + + for ( GenomeLoc exon : exons ) { + if ( exon.containsP(position) ) { + overlapString.append(String.format("exon_%d",exonNo)); + is_exon = true; + break; + } + exonNo ++; + } + + if ( ! is_exon ) { + if ( overlapsCodingP(position) ) { + overlapString.append("Intron"); + } else { + overlapString.append("UTR"); + } + } + + return overlapString.toString(); + } + + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ + public boolean overlapsP (GenomeLoc that) { + return getLocation().overlapsP(that); + } + + /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. + * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, + * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. + * @see #overlapsExonP + */ + public boolean overlapsCodingP (GenomeLoc that) { + return transcript_coding_interval.overlapsP(that); + } + + /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ + public boolean overlapsExonP (GenomeLoc that) { + for ( GenomeLoc e : exons ) { + if ( e.overlapsP(that) ) return true; + } + return false; + } + public String toString() { + StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something + b.append(transcript_id); // #1 + b.append('\t'); + b.append(getLocation().getContig()); // #2 + b.append('\t'); + b.append( (strand==1?'+':'-') ); // #3 + b.append('\t'); + b.append( (getLocation().getStart() - 1) ); // #4 + b.append('\t'); + b.append( getLocation().getStop()); // #5 + b.append('\t'); + b.append( (transcript_coding_interval.getStart() - 1) ); // #6 + b.append('\t'); + b.append( transcript_coding_interval.getStop()); // #7 + b.append('\t'); + b.append(exons.size()); // #8 + b.append('\t'); + for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9 + b.append('\t'); + for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10 + b.append("\t0\t"); // # 11 - unused? + b.append(gene_name); // # 12 + b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused? + for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15 + + + return b.toString(); + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin an exon of any of those transcripts. Passing null is safe (will return false). + * NOTE: position can be still within a UTR, see #isCoding + * @return true if it's an exon + */ + public static boolean isExon(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; + } + return false; + + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin a coding region of any of those transcripts. + * Passing null is safe (will return false). + * NOTE: "coding" interval is defined as a single genomic interval, so it + * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a + * transcript, or internal exons that are not spliced into a given transcript. To check that a position is + * indeed within an exon but not in UTR, use #isCodingExon(). + * @return + */ + public static boolean isCoding(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true; + } + return false; + + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin a coding exon portion (i.e. true coding sequence) of any of those transcripts. + * Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript, + * for which the current position is within an exon and within a coding interval simultaneously. + * @return + */ + public static boolean isCodingExon(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) && ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; + } + return false; + + } + + + public void setTranscript_id(String transcript_id) { + this.transcript_id = transcript_id; + } + + public void setStrand(int strand) { + this.strand = strand; + } + + public void setTranscript_interval(GenomeLoc transcript_interval) { + this.transcript_interval = transcript_interval; + } + + public void setTranscript_coding_interval(GenomeLoc transcript_coding_interval) { + this.transcript_coding_interval = transcript_coding_interval; + } + + public void setExons(List exons) { + this.exons = exons; + } + + public void setGene_name(String gene_name) { + this.gene_name = gene_name; + } + + public void setExon_frames(List exon_frames) { + this.exon_frames = exon_frames; + } + + public void setName(String name) { + this.name = name; + } + + @Override + public String getChr() { + return contig; + } + + @Override + public int getStart() { + return start; + } + + @Override + public int getEnd() { + return stop; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index b90163846..c13f0700c 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -35,7 +35,7 @@ public class VCF4Codec implements FeatureCodec { private static Map> alleleMap = new HashMap>(3); // cache the genotyope values - private static String[] CachedGTValues = new String[100]; + private static String[] GTValueArray = new String[100]; // for performance testing purposes public static boolean validate = true; @@ -56,7 +56,7 @@ public class VCF4Codec implements FeatureCodec { ArrayList filterFields = new ArrayList(); // do we want to validate the info, format, and filter fields - private final boolean validateFromHeader = true; + private final boolean validateFromHeader = false; // we store a name to give to each of the variant contexts we emit private String name = "Unkown"; @@ -377,49 +377,72 @@ public class VCF4Codec implements FeatureCodec { // do we have genotyping data if (parts.length > 8) { - String GT = parts[8]; int genotypesStart = 9; - // parse genotypes - int nGTKeys = ParsingUtils.split(GT, genotypeKeyArray, ':'); - genotypes = new HashMap(Math.max(parts.length - genotypesStart, 1)); - Iterator iter = header.getGenotypeSamples().iterator(); - - alleleMap.clear(); - for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) { - String sample = parts[genotypeOffset]; - String[] GTValues = CachedGTValues; - ParsingUtils.split(sample, GTValues, ':'); - List genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap); - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; - Set genotypeFilters = null; - - // todo -- the parsing of attributes could be made lazy for performance - Map gtAttributes = null; - if (nGTKeys > 1) { - gtAttributes = new HashMap(nGTKeys - 1); - for (int i = 1; i < nGTKeys; i++) { - if (genotypeKeyArray[i].equals("GQ")) { - GTQual = parseQual(GTValues[i]); - } - if (genotypeKeyArray[i].equals("FL")) { // deal with genotype filters here - genotypeFilters.addAll(parseFilters(GTValues[i])); - } else { - gtAttributes.put(genotypeKeyArray[i], GTValues[i]); - } - } - // validate the format fields - validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); - } - - boolean phased = genotypeKeyArray[0].charAt(1) == '|'; - - Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); - genotypes.put(g.getSampleName(), g); - } + genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart); } return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); } + /** + * create a genotype map + * @param parts the string parts + * @param locAndAlleles the locations and the list of alleles + * @param genotypesStart the position in the parts array that the genotype strings start + * @return a mapping of sample name to genotype object + */ + private Map createGenotypeMap(String[] parts, Pair> locAndAlleles, int genotypesStart) { + Map genotypes = new LinkedHashMap(Math.max(parts.length - genotypesStart, 1)); + + // get the format keys + int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':'); + + // cycle through the sample names + Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); + + // clear out our allele mapping + alleleMap.clear(); + + // cycle through the genotype strings + for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) { + int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':'); + List genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap); + double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + Set genotypeFilters = null; + String sampleName = sampleNameIterator.next(); + + + // todo -- the parsing of attributes could be made lazy for performance + Map gtAttributes = null; + + // check to see if the value list is longer than the key list, which is a problem + if (nGTKeys < GTValueSplitSize) + throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]); + + if (nGTKeys > 1) { + gtAttributes = new HashMap(nGTKeys - 1); + for (int i = 1; i < nGTKeys; i++) { + if (i >= GTValueSplitSize) + gtAttributes.put(genotypeKeyArray[i],"."); + else if (genotypeKeyArray[i].equals("GQ")) + GTQual = parseQual(GTValueArray[i]); + else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here + genotypeFilters.addAll(parseFilters(GTValueArray[i])); + else + gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]); + + } + // validate the format fields + validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); + } + + boolean phased = genotypeKeyArray[0].charAt(1) == '|'; + + Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); + genotypes.put(g.getSampleName(), g); + } + return genotypes; + } + /** * clip the alleles, based on the reference * @param unclippedAlleles the list of alleles diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java deleted file mode 100644 index 87c133c8f..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java +++ /dev/null @@ -1,252 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.StingException; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 22, 2009 - * Time: 3:19:41 PM - * To change this template use File | Settings | File Templates. - */ -public class rodRefSeq extends BasicReferenceOrderedDatum implements Transcript { - - private String transcript_id; - private int strand; - private GenomeLoc transcript_interval; - private GenomeLoc transcript_coding_interval; - private List exons; - private String gene_name; - private List exon_frames; - - - public rodRefSeq(String name) { - super(name); - } - - /** Returns id of the transcript (RefSeq NM_* id) */ - public String getTranscriptId() { return transcript_id; } - /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ - public int getStrand() { return strand; } - /** Returns transcript's full genomic interval (includes all exons with UTRs) */ - public GenomeLoc getLocation() { return transcript_interval; } - /** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */ - public GenomeLoc getCodingLocation() { return transcript_coding_interval; } - /** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */ - public String getGeneName() { return gene_name; } - /** Number of exons in this transcript */ - public int getNumExons() { return exons.size(); } - /** Genomic location of the n-th exon; throws an exception if n is out of bounds */ - public GenomeLoc getExonLocation(int n) { - if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n); - return exons.get(n); - } - /** Returns the list of all exons in this transcript, as genomic intervals */ - public List getExons() { return exons; } - - /** Returns all exons falling ::entirely:: inside an interval **/ - public List getExonsInInterval( GenomeLoc interval ) { - List relevantExons = new ArrayList(exons.size()); - for ( GenomeLoc exon : getExons() ) { - if ( interval.containsP(exon) ) { - relevantExons.add(exon); - } - } - - return relevantExons; - } - - /** convenience method; returns the numbers of the exons in the interval **/ - public List getExonNumbersInInterval( GenomeLoc interval ) { - List numbers = new ArrayList(); - int iNo = 0; - for ( GenomeLoc exon : getExons() ) { - if ( interval.containsP(exon) ) { - numbers.add(iNo); - } - iNo++; - } - - return numbers; - } - - public String getTranscriptUniqueGeneName() { - return String.format("%s(%s)",getGeneName(),getTranscriptId()); - } - - public String getOverlapString(GenomeLoc position) { - boolean is_exon = false; - StringBuilder overlapString = new StringBuilder(); - int exonNo = 1; - - for ( GenomeLoc exon : exons ) { - if ( exon.containsP(position) ) { - overlapString.append(String.format("exon_%d",exonNo)); - is_exon = true; - break; - } - exonNo ++; - } - - if ( ! is_exon ) { - if ( overlapsCodingP(position) ) { - overlapString.append("Intron"); - } else { - overlapString.append("UTR"); - } - } - - return overlapString.toString(); - } - - /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ - public boolean overlapsP (GenomeLoc that) { - return transcript_interval.overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. - * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, - * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. - * @see #overlapsExonP - */ - public boolean overlapsCodingP (GenomeLoc that) { - return transcript_coding_interval.overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ - public boolean overlapsExonP (GenomeLoc that) { - for ( GenomeLoc e : exons ) { - if ( e.overlapsP(that) ) return true; - } - return false; - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - @Override - public boolean parseLine(final Object header, String[] fields) { - transcript_id = fields[1]; - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1; - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1; - else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]); - - String contig_name = fields[2]; - transcript_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); - transcript_coding_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])); - gene_name = fields[12]; - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ"; - assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ"; - - exons = new ArrayList(exon_starts.length); - exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - return true; - } - - public String toString() { - StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something - b.append(transcript_id); // #1 - b.append('\t'); - b.append(transcript_interval.getContig()); // #2 - b.append('\t'); - b.append( (strand==1?'+':'-') ); // #3 - b.append('\t'); - b.append( (transcript_interval.getStart() - 1) ); // #4 - b.append('\t'); - b.append( transcript_interval.getStop()); // #5 - b.append('\t'); - b.append( (transcript_coding_interval.getStart() - 1) ); // #6 - b.append('\t'); - b.append( transcript_coding_interval.getStop()); // #7 - b.append('\t'); - b.append(exons.size()); // #8 - b.append('\t'); - for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9 - b.append('\t'); - for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10 - b.append("\t0\t"); // # 11 - unused? - b.append(gene_name); // # 12 - b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused? - for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15 - - - return b.toString(); - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin an exon of any of those transcripts. Passing null is safe (will return false). - * NOTE: position can be still within a UTR, see #isCoding - * @return true if it's an exon - */ - public static boolean isExon(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; - } - return false; - - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin a coding region of any of those transcripts. - * Passing null is safe (will return false). - * NOTE: "coding" interval is defined as a single genomic interval, so it - * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a - * transcript, or internal exons that are not spliced into a given transcript. To check that a position is - * indeed within an exon but not in UTR, use #isCodingExon(). - * @return - */ - public static boolean isCoding(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true; - } - return false; - - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin a coding exon portion (i.e. true coding sequence) of any of those transcripts. - * Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript, - * for which the current position is within an exon and within a coding interval simultaneously. - * @return - */ - public static boolean isCodingExon(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) && ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; - } - return false; - - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java index 8041abc9d..f795ef8a8 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java @@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder { static { // All known ROD types Types.put("GELI", rodGELI.class); - Types.put("RefSeq", rodRefSeq.class); Types.put("Table", TabularROD.class); Types.put("HapMap", HapMapROD.class); Types.put("Intervals", IntervalRod.class); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java index c4b3a7947..5b5ab287c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java @@ -29,11 +29,18 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broad.tribble.FeatureReader; +import org.broad.tribble.dbsnp.DbSNPCodec; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; @@ -46,6 +53,7 @@ import org.broadinstitute.sting.utils.collections.CircularArray; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.commandline.Argument; +import java.io.File; import java.io.IOException; import java.util.*; @@ -136,11 +144,15 @@ public class IndelGenotyperV2Walker extends ReadWalker { normal_context = new WindowContext(0,WINDOW_SIZE); if ( RefseqFileName != null ) { - ReferenceOrderedData refseq = new ReferenceOrderedData("refseq", - new java.io.File(RefseqFileName), rodRefSeq.class); + TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first; - refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator())); - logger.info("Using RefSeq annotations from "+RefseqFileName); + try { + refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq")); + } catch (IOException e) { + throw new StingException("Unable to open file " + RefseqFileName, e); + } + logger.info("Using RefSeq annotations from "+RefseqFileName); } if ( refseqIterator == null ) logger.info("No annotations available"); @@ -627,11 +639,11 @@ public class IndelGenotyperV2Walker extends ReadWalker { else { StringBuilder b = new StringBuilder(); - if ( rodRefSeq.isExon(ann) ) { - if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence + if ( RefSeqFeature.isExon(ann) ) { + if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence else b.append(annUTR); // exon but not coding = UTR } else { - if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron + if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... } b.append('\t'); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java index c8992f416..136529a52 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; @@ -44,13 +45,26 @@ import java.util.LinkedList; * Prints out counts of the number of reference ordered data objects are * each locus for debugging RodWalkers. */ -public class CountRodWalker extends RodWalker, Long>> { +public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> { @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) public boolean showSkipped = false; + @Override + public Pair, Long> treeReduce(Pair, Long> lhs, Pair, Long> rhs) { + ExpandingArrayList nt = new ExpandingArrayList(); + nt.addAll(lhs.first); + int index = 0; + for (Long l : rhs.first) + if (nt.get(index) == null) + nt.add(l); + else + nt.set(index,nt.get(index) + l); + return new Pair, Long>(nt, lhs.second + rhs.second); + } + public class Datum { public long nRodsAtThisLocation = 0; public long nSkippedBases =0; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java index c624c2347..5a86e8e41 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java @@ -1,21 +1,15 @@ package org.broadinstitute.sting.oneoffprojects.walkers; import org.broad.tribble.bed.BEDFeature; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.genotype.vcf.*; import java.util.*; @@ -31,7 +25,7 @@ import java.util.*; public class DesignFileGeneratorWalker extends RodWalker { private HashMap intervalBuffer = new HashMap(); - private HashSet refseqBuffer = new HashSet(); + private HashSet refseqBuffer = new HashSet(); private HashMap currentBedFeatures = new HashMap(); public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -59,8 +53,8 @@ public class DesignFileGeneratorWalker extends RodWalker { if ( refseqList != null && refseqList.size() > 0 ) { for ( Object seq : refseqList ) { - if ( ! refseqBuffer.contains( (rodRefSeq) seq) ) { - refseqBuffer.add( (rodRefSeq) seq ); + if ( ! refseqBuffer.contains( (RefSeqFeature) seq) ) { + refseqBuffer.add( (RefSeqFeature) seq ); } } } @@ -84,7 +78,7 @@ public class DesignFileGeneratorWalker extends RodWalker { private long process() { long nUpdate = 0l; for ( GenomeLoc interval : intervalBuffer.keySet() ) { - for ( rodRefSeq refseq : refseqBuffer ) { + for ( RefSeqFeature refseq : refseqBuffer ) { if ( interval.overlapsP(refseq.getLocation()) && ! intervalBuffer.get(interval).geneNames.contains(refseq.getTranscriptUniqueGeneName()) ) { // if the interval overlaps the gene transcript; and the gene is not already represented in the interval @@ -119,14 +113,14 @@ public class DesignFileGeneratorWalker extends RodWalker { * @return diddly */ public void cleanup(ReferenceContext ref) { - List toRemove = new ArrayList(); - for ( rodRefSeq refseq : refseqBuffer ) { + List toRemove = new ArrayList(); + for ( RefSeqFeature refseq : refseqBuffer ) { if ( refseq.getLocation().isBefore(ref.getLocus()) ) { toRemove.add(refseq); } } - for ( rodRefSeq refseq : toRemove ) { + for ( RefSeqFeature refseq : toRemove ) { refseqBuffer.remove(refseq); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java index 7d0817c19..c3feb4030 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.oneoffprojects.walkers; +import org.broad.tribble.FeatureReader; import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeaderLine; import org.broad.tribble.vcf.VCFInfoHeaderLine; @@ -8,6 +9,10 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RodWalker; @@ -15,6 +20,8 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.vcf.*; +import java.io.File; +import java.io.IOException; import java.util.*; /** @@ -42,11 +49,11 @@ public class IndelAnnotator extends RodWalker{ else { StringBuilder b = new StringBuilder(); - if ( rodRefSeq.isExon(ann) ) { - if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence + if ( RefSeqFeature.isExon(ann) ) { + if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence else b.append(annUTR); // exon but not coding = UTR } else { - if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron + if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... } b.append('\t'); @@ -57,10 +64,15 @@ public class IndelAnnotator extends RodWalker{ public void initialize() { if ( RefseqFileName != null ) { - ReferenceOrderedData refseq = new ReferenceOrderedData("refseq", - new java.io.File(RefseqFileName), rodRefSeq.class); + TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first; + + try { + refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq")); + } catch (IOException e) { + throw new StingException("Unable to open file " + RefseqFileName, e); + } - refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator())); logger.info("Using RefSeq annotations from "+RefseqFileName); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java index b5777634e..b7b086394 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java @@ -31,21 +31,22 @@ import net.sf.picard.util.IntervalList; import net.sf.picard.util.OverlapDetector; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; +import org.broad.tribble.FeatureReader; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.rodRefSeq; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.*; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; @@ -108,10 +109,15 @@ public class HybSelPerformanceWalker extends LocusWalker refseq = new ReferenceOrderedData("refseq", - new java.io.File(REFSEQ_FILE), rodRefSeq.class); + TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class, new File(REFSEQ_FILE)).first; + + try { + refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(), "refseq")); + } catch (IOException e) { + throw new StingException("Unable to open file " + REFSEQ_FILE, e); + } - refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator())); logger.info("Using RefSeq annotations from "+REFSEQ_FILE); } @@ -282,8 +288,8 @@ public class HybSelPerformanceWalker extends LocusWalker