Two fixes together:

1) Some improvements to the VCF4 parsing, including disabling validation.
2) Reimplemented RefSeq in the new Tribble-style rod system.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3630 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-24 22:17:03 +00:00
parent 72e669538e
commit 682f9b46c6
11 changed files with 506 additions and 333 deletions

View File

@ -0,0 +1,80 @@
package org.broadinstitute.sting.gatk.refdata.features.refseq;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.util.LineReader;
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.Transcript;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.util.ArrayList;
import java.util.List;
/**
* the ref seq codec
*/
public class RefSeqCodec implements FeatureCodec {
@Override
public Feature decodeLoc(String line) {
String fields[] = line.split("\t");
String contig_name = fields[2];
return new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
}
/** Fills this object from a text line in RefSeq (UCSC) text dump file */
@Override
public Feature decode(String line) {
String fields[] = line.split("\t");
String contig_name = fields[2];
RefSeqFeature feature = new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
feature.setTranscript_id(fields[1]);
if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1);
else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1);
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
feature.setTranscript_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
feature.setTranscript_coding_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])));
feature.setGene_name(fields[12]);
String[] exon_starts = fields[9].split(",");
String[] exon_stops = fields[10].split(",");
String[] eframes = fields[15].split(",");
assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
ArrayList<GenomeLoc> exons = new ArrayList<GenomeLoc>(exon_starts.length);
ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
exon_frames.add(Integer.decode(eframes[i]));
}
feature.setExons(exons);
feature.setExon_frames(exon_frames);
return feature;
}
@Override
public int readHeader(LineReader reader) {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public Class getFeatureType() {
return RefSeqCodec.class;
}
@Override
public Object getHeader(Class clazz) throws ClassCastException {
return null; // we don't have a header
}
}

View File

@ -0,0 +1,285 @@
package org.broadinstitute.sting.gatk.refdata.features.refseq;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.refdata.Transcript;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.util.ArrayList;
import java.util.List;
/**
* the ref seq feature
*/
public class RefSeqFeature implements Transcript, Feature {
private String transcript_id;
private int strand;
private GenomeLoc transcript_interval;
private GenomeLoc transcript_coding_interval;
private List<GenomeLoc> exons;
private String gene_name;
private List<Integer> exon_frames;
private String name;
// store the contig, start, and stop for this record
private final String contig;
private final int start;
private final int stop;
public RefSeqFeature(String contig, int start, int stop) {
this.contig = contig;
this.start = start;
this.stop = stop;
}
/** Returns id of the transcript (RefSeq NM_* id) */
public String getTranscriptId() { return transcript_id; }
/** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
public int getStrand() { return strand; }
/** Returns transcript's full genomic interval (includes all exons with UTRs) */
public GenomeLoc getLocation() {
if (transcript_interval == null)
transcript_interval = GenomeLocParser.parseGenomeLoc(contig,start,stop);
return transcript_interval;
}
/** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */
public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
/** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */
public String getGeneName() { return gene_name; }
/** Number of exons in this transcript */
public int getNumExons() { return exons.size(); }
/** Genomic location of the n-th exon; throws an exception if n is out of bounds */
public GenomeLoc getExonLocation(int n) {
if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
return exons.get(n);
}
/** Returns the list of all exons in this transcript, as genomic intervals */
public List<GenomeLoc> getExons() { return exons; }
/** Returns all exons falling ::entirely:: inside an interval **/
public List<GenomeLoc> getExonsInInterval( GenomeLoc interval ) {
List<GenomeLoc> relevantExons = new ArrayList<GenomeLoc>(exons.size());
for ( GenomeLoc exon : getExons() ) {
if ( interval.containsP(exon) ) {
relevantExons.add(exon);
}
}
return relevantExons;
}
/** convenience method; returns the numbers of the exons in the interval **/
public List<Integer> getExonNumbersInInterval( GenomeLoc interval ) {
List<Integer> numbers = new ArrayList<Integer>();
int iNo = 0;
for ( GenomeLoc exon : getExons() ) {
if ( interval.containsP(exon) ) {
numbers.add(iNo);
}
iNo++;
}
return numbers;
}
public String getTranscriptUniqueGeneName() {
return String.format("%s(%s)",getGeneName(),getTranscriptId());
}
public String getOverlapString(GenomeLoc position) {
boolean is_exon = false;
StringBuilder overlapString = new StringBuilder();
int exonNo = 1;
for ( GenomeLoc exon : exons ) {
if ( exon.containsP(position) ) {
overlapString.append(String.format("exon_%d",exonNo));
is_exon = true;
break;
}
exonNo ++;
}
if ( ! is_exon ) {
if ( overlapsCodingP(position) ) {
overlapString.append("Intron");
} else {
overlapString.append("UTR");
}
}
return overlapString.toString();
}
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
public boolean overlapsP (GenomeLoc that) {
return getLocation().overlapsP(that);
}
/** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
* NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
* but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
* @see #overlapsExonP
*/
public boolean overlapsCodingP (GenomeLoc that) {
return transcript_coding_interval.overlapsP(that);
}
/** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
public boolean overlapsExonP (GenomeLoc that) {
for ( GenomeLoc e : exons ) {
if ( e.overlapsP(that) ) return true;
}
return false;
}
public String toString() {
StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something
b.append(transcript_id); // #1
b.append('\t');
b.append(getLocation().getContig()); // #2
b.append('\t');
b.append( (strand==1?'+':'-') ); // #3
b.append('\t');
b.append( (getLocation().getStart() - 1) ); // #4
b.append('\t');
b.append( getLocation().getStop()); // #5
b.append('\t');
b.append( (transcript_coding_interval.getStart() - 1) ); // #6
b.append('\t');
b.append( transcript_coding_interval.getStop()); // #7
b.append('\t');
b.append(exons.size()); // #8
b.append('\t');
for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9
b.append('\t');
for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10
b.append("\t0\t"); // # 11 - unused?
b.append(gene_name); // # 12
b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused?
for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15
return b.toString();
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin an exon of <i>any</i> of those transcripts. Passing null is safe (will return false).
* NOTE: position can be still within a UTR, see #isCoding
* @return true if it's an exon
*/
public static boolean isExon(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
}
return false;
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin a coding region of <i>any</i> of those transcripts.
* Passing null is safe (will return false).
* NOTE: "coding" interval is defined as a single genomic interval, so it
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
* transcript, or internal exons that are not spliced into a given transcript. To check that a position is
* indeed within an exon but not in UTR, use #isCodingExon().
* @return
*/
public static boolean isCoding(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true;
}
return false;
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin a coding exon portion (i.e. true coding sequence) of <i>any</i> of those transcripts.
* Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript,
* for which the current position is within an exon <i>and</i> within a coding interval simultaneously.
* @return
*/
public static boolean isCodingExon(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) && ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
}
return false;
}
public void setTranscript_id(String transcript_id) {
this.transcript_id = transcript_id;
}
public void setStrand(int strand) {
this.strand = strand;
}
public void setTranscript_interval(GenomeLoc transcript_interval) {
this.transcript_interval = transcript_interval;
}
public void setTranscript_coding_interval(GenomeLoc transcript_coding_interval) {
this.transcript_coding_interval = transcript_coding_interval;
}
public void setExons(List<GenomeLoc> exons) {
this.exons = exons;
}
public void setGene_name(String gene_name) {
this.gene_name = gene_name;
}
public void setExon_frames(List<Integer> exon_frames) {
this.exon_frames = exon_frames;
}
public void setName(String name) {
this.name = name;
}
@Override
public String getChr() {
return contig;
}
@Override
public int getStart() {
return start;
}
@Override
public int getEnd() {
return stop;
}
}

View File

@ -35,7 +35,7 @@ public class VCF4Codec implements FeatureCodec {
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
// cache the genotyope values
private static String[] CachedGTValues = new String[100];
private static String[] GTValueArray = new String[100];
// for performance testing purposes
public static boolean validate = true;
@ -56,7 +56,7 @@ public class VCF4Codec implements FeatureCodec {
ArrayList<String> filterFields = new ArrayList<String>();
// do we want to validate the info, format, and filter fields
private final boolean validateFromHeader = true;
private final boolean validateFromHeader = false;
// we store a name to give to each of the variant contexts we emit
private String name = "Unkown";
@ -377,49 +377,72 @@ public class VCF4Codec implements FeatureCodec {
// do we have genotyping data
if (parts.length > 8) {
String GT = parts[8];
int genotypesStart = 9;
// parse genotypes
int nGTKeys = ParsingUtils.split(GT, genotypeKeyArray, ':');
genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
Iterator<String> iter = header.getGenotypeSamples().iterator();
alleleMap.clear();
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
String sample = parts[genotypeOffset];
String[] GTValues = CachedGTValues;
ParsingUtils.split(sample, GTValues, ':');
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
// todo -- the parsing of attributes could be made lazy for performance
Map<String, String> gtAttributes = null;
if (nGTKeys > 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for (int i = 1; i < nGTKeys; i++) {
if (genotypeKeyArray[i].equals("GQ")) {
GTQual = parseQual(GTValues[i]);
}
if (genotypeKeyArray[i].equals("FL")) { // deal with genotype filters here
genotypeFilters.addAll(parseFilters(GTValues[i]));
} else {
gtAttributes.put(genotypeKeyArray[i], GTValues[i]);
}
}
// validate the format fields
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
}
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
genotypes.put(g.getSampleName(), g);
}
genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart);
}
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
}
/**
* create a genotype map
* @param parts the string parts
* @param locAndAlleles the locations and the list of alleles
* @param genotypesStart the position in the parts array that the genotype strings start
* @return a mapping of sample name to genotype object
*/
private Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int genotypesStart) {
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
// get the format keys
int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':');
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
String sampleName = sampleNameIterator.next();
// todo -- the parsing of attributes could be made lazy for performance
Map<String, String> gtAttributes = null;
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
if (nGTKeys > 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for (int i = 1; i < nGTKeys; i++) {
if (i >= GTValueSplitSize)
gtAttributes.put(genotypeKeyArray[i],".");
else if (genotypeKeyArray[i].equals("GQ"))
GTQual = parseQual(GTValueArray[i]);
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
genotypeFilters.addAll(parseFilters(GTValueArray[i]));
else
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);
}
// validate the format fields
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
}
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
genotypes.put(g.getSampleName(), g);
}
return genotypes;
}
/**
* clip the alleles, based on the reference
* @param unclippedAlleles the list of alleles

View File

@ -1,252 +0,0 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Sep 22, 2009
* Time: 3:19:41 PM
* To change this template use File | Settings | File Templates.
*/
public class rodRefSeq extends BasicReferenceOrderedDatum implements Transcript {
private String transcript_id;
private int strand;
private GenomeLoc transcript_interval;
private GenomeLoc transcript_coding_interval;
private List<GenomeLoc> exons;
private String gene_name;
private List<Integer> exon_frames;
public rodRefSeq(String name) {
super(name);
}
/** Returns id of the transcript (RefSeq NM_* id) */
public String getTranscriptId() { return transcript_id; }
/** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
public int getStrand() { return strand; }
/** Returns transcript's full genomic interval (includes all exons with UTRs) */
public GenomeLoc getLocation() { return transcript_interval; }
/** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */
public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
/** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */
public String getGeneName() { return gene_name; }
/** Number of exons in this transcript */
public int getNumExons() { return exons.size(); }
/** Genomic location of the n-th exon; throws an exception if n is out of bounds */
public GenomeLoc getExonLocation(int n) {
if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
return exons.get(n);
}
/** Returns the list of all exons in this transcript, as genomic intervals */
public List<GenomeLoc> getExons() { return exons; }
/** Returns all exons falling ::entirely:: inside an interval **/
public List<GenomeLoc> getExonsInInterval( GenomeLoc interval ) {
List<GenomeLoc> relevantExons = new ArrayList<GenomeLoc>(exons.size());
for ( GenomeLoc exon : getExons() ) {
if ( interval.containsP(exon) ) {
relevantExons.add(exon);
}
}
return relevantExons;
}
/** convenience method; returns the numbers of the exons in the interval **/
public List<Integer> getExonNumbersInInterval( GenomeLoc interval ) {
List<Integer> numbers = new ArrayList<Integer>();
int iNo = 0;
for ( GenomeLoc exon : getExons() ) {
if ( interval.containsP(exon) ) {
numbers.add(iNo);
}
iNo++;
}
return numbers;
}
public String getTranscriptUniqueGeneName() {
return String.format("%s(%s)",getGeneName(),getTranscriptId());
}
public String getOverlapString(GenomeLoc position) {
boolean is_exon = false;
StringBuilder overlapString = new StringBuilder();
int exonNo = 1;
for ( GenomeLoc exon : exons ) {
if ( exon.containsP(position) ) {
overlapString.append(String.format("exon_%d",exonNo));
is_exon = true;
break;
}
exonNo ++;
}
if ( ! is_exon ) {
if ( overlapsCodingP(position) ) {
overlapString.append("Intron");
} else {
overlapString.append("UTR");
}
}
return overlapString.toString();
}
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
public boolean overlapsP (GenomeLoc that) {
return transcript_interval.overlapsP(that);
}
/** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
* NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
* but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
* @see #overlapsExonP
*/
public boolean overlapsCodingP (GenomeLoc that) {
return transcript_coding_interval.overlapsP(that);
}
/** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
public boolean overlapsExonP (GenomeLoc that) {
for ( GenomeLoc e : exons ) {
if ( e.overlapsP(that) ) return true;
}
return false;
}
/** Fills this object from a text line in RefSeq (UCSC) text dump file */
@Override
public boolean parseLine(final Object header, String[] fields) {
transcript_id = fields[1];
if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1;
else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1;
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
String contig_name = fields[2];
transcript_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
transcript_coding_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
gene_name = fields[12];
String[] exon_starts = fields[9].split(",");
String[] exon_stops = fields[10].split(",");
String[] eframes = fields[15].split(",");
assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
exons = new ArrayList<GenomeLoc>(exon_starts.length);
exon_frames = new ArrayList<Integer>(eframes.length);
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
exon_frames.add(Integer.decode(eframes[i]));
}
return true;
}
public String toString() {
StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something
b.append(transcript_id); // #1
b.append('\t');
b.append(transcript_interval.getContig()); // #2
b.append('\t');
b.append( (strand==1?'+':'-') ); // #3
b.append('\t');
b.append( (transcript_interval.getStart() - 1) ); // #4
b.append('\t');
b.append( transcript_interval.getStop()); // #5
b.append('\t');
b.append( (transcript_coding_interval.getStart() - 1) ); // #6
b.append('\t');
b.append( transcript_coding_interval.getStop()); // #7
b.append('\t');
b.append(exons.size()); // #8
b.append('\t');
for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9
b.append('\t');
for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10
b.append("\t0\t"); // # 11 - unused?
b.append(gene_name); // # 12
b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused?
for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15
return b.toString();
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin an exon of <i>any</i> of those transcripts. Passing null is safe (will return false).
* NOTE: position can be still within a UTR, see #isCoding
* @return true if it's an exon
*/
public static boolean isExon(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
}
return false;
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin a coding region of <i>any</i> of those transcripts.
* Passing null is safe (will return false).
* NOTE: "coding" interval is defined as a single genomic interval, so it
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
* transcript, or internal exons that are not spliced into a given transcript. To check that a position is
* indeed within an exon but not in UTR, use #isCodingExon().
* @return
*/
public static boolean isCoding(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true;
}
return false;
}
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
* this position is fully whithin a coding exon portion (i.e. true coding sequence) of <i>any</i> of those transcripts.
* Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript,
* for which the current position is within an exon <i>and</i> within a coding interval simultaneously.
* @return
*/
public static boolean isCodingExon(RODRecordList l) {
if ( l == null ) return false;
GenomeLoc loc = l.getLocation();
for ( GATKFeature t : l ) {
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) && ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
}
return false;
}
}

View File

@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder {
static {
// All known ROD types
Types.put("GELI", rodGELI.class);
Types.put("RefSeq", rodRefSeq.class);
Types.put("Table", TabularROD.class);
Types.put("HapMap", HapMapROD.class);
Types.put("Intervals", IntervalRod.class);

View File

@ -29,11 +29,18 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.dbsnp.DbSNPCodec;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
@ -46,6 +53,7 @@ import org.broadinstitute.sting.utils.collections.CircularArray;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.io.IOException;
import java.util.*;
@ -136,11 +144,15 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
normal_context = new WindowContext(0,WINDOW_SIZE);
if ( RefseqFileName != null ) {
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
new java.io.File(RefseqFileName), rodRefSeq.class);
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
logger.info("Using RefSeq annotations from "+RefseqFileName);
try {
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq"));
} catch (IOException e) {
throw new StingException("Unable to open file " + RefseqFileName, e);
}
logger.info("Using RefSeq annotations from "+RefseqFileName);
}
if ( refseqIterator == null ) logger.info("No annotations available");
@ -627,11 +639,11 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
else {
StringBuilder b = new StringBuilder();
if ( rodRefSeq.isExon(ann) ) {
if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
if ( RefSeqFeature.isExon(ann) ) {
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
else b.append(annUTR); // exon but not coding = UTR
} else {
if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
}
b.append('\t');

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
@ -44,13 +45,26 @@ import java.util.LinkedList;
* Prints out counts of the number of reference ordered data objects are
* each locus for debugging RodWalkers.
*/
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
public boolean showSkipped = false;
@Override
public Pair<ExpandingArrayList<Long>, Long> treeReduce(Pair<ExpandingArrayList<Long>, Long> lhs, Pair<ExpandingArrayList<Long>, Long> rhs) {
ExpandingArrayList<Long> nt = new ExpandingArrayList<Long>();
nt.addAll(lhs.first);
int index = 0;
for (Long l : rhs.first)
if (nt.get(index) == null)
nt.add(l);
else
nt.set(index,nt.get(index) + l);
return new Pair<ExpandingArrayList<Long>, Long>(nt, lhs.second + rhs.second);
}
public class Datum {
public long nRodsAtThisLocation = 0;
public long nSkippedBases =0;

View File

@ -1,21 +1,15 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.bed.BEDFeature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.util.*;
@ -31,7 +25,7 @@ import java.util.*;
public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
private HashMap<GenomeLoc,IntervalInfoBuilder> intervalBuffer = new HashMap<GenomeLoc,IntervalInfoBuilder>();
private HashSet<rodRefSeq> refseqBuffer = new HashSet<rodRefSeq>();
private HashSet<RefSeqFeature> refseqBuffer = new HashSet<RefSeqFeature>();
private HashMap<String,BEDFeature> currentBedFeatures = new HashMap<String,BEDFeature>();
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -59,8 +53,8 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
if ( refseqList != null && refseqList.size() > 0 ) {
for ( Object seq : refseqList ) {
if ( ! refseqBuffer.contains( (rodRefSeq) seq) ) {
refseqBuffer.add( (rodRefSeq) seq );
if ( ! refseqBuffer.contains( (RefSeqFeature) seq) ) {
refseqBuffer.add( (RefSeqFeature) seq );
}
}
}
@ -84,7 +78,7 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
private long process() {
long nUpdate = 0l;
for ( GenomeLoc interval : intervalBuffer.keySet() ) {
for ( rodRefSeq refseq : refseqBuffer ) {
for ( RefSeqFeature refseq : refseqBuffer ) {
if ( interval.overlapsP(refseq.getLocation()) &&
! intervalBuffer.get(interval).geneNames.contains(refseq.getTranscriptUniqueGeneName()) ) {
// if the interval overlaps the gene transcript; and the gene is not already represented in the interval
@ -119,14 +113,14 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
* @return diddly
*/
public void cleanup(ReferenceContext ref) {
List<rodRefSeq> toRemove = new ArrayList<rodRefSeq>();
for ( rodRefSeq refseq : refseqBuffer ) {
List<RefSeqFeature> toRemove = new ArrayList<RefSeqFeature>();
for ( RefSeqFeature refseq : refseqBuffer ) {
if ( refseq.getLocation().isBefore(ref.getLocus()) ) {
toRemove.add(refseq);
}
}
for ( rodRefSeq refseq : toRemove ) {
for ( RefSeqFeature refseq : toRemove ) {
refseqBuffer.remove(refseq);
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
@ -8,6 +9,10 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
@ -15,6 +20,8 @@ import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
@ -42,11 +49,11 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
else {
StringBuilder b = new StringBuilder();
if ( rodRefSeq.isExon(ann) ) {
if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
if ( RefSeqFeature.isExon(ann) ) {
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
else b.append(annUTR); // exon but not coding = UTR
} else {
if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
}
b.append('\t');
@ -57,10 +64,15 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
public void initialize() {
if ( RefseqFileName != null ) {
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
new java.io.File(RefseqFileName), rodRefSeq.class);
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
try {
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq"));
} catch (IOException e) {
throw new StingException("Unable to open file " + RefseqFileName, e);
}
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
logger.info("Using RefSeq annotations from "+RefseqFileName);
}

View File

@ -31,21 +31,22 @@ import net.sf.picard.util.IntervalList;
import net.sf.picard.util.OverlapDetector;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broad.tribble.FeatureReader;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.*;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
@ -108,10 +109,15 @@ public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerforma
@Override
public void initialize() {
if ( REFSEQ_FILE != null ) {
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
new java.io.File(REFSEQ_FILE), rodRefSeq.class);
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class, new File(REFSEQ_FILE)).first;
try {
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(), "refseq"));
} catch (IOException e) {
throw new StingException("Unable to open file " + REFSEQ_FILE, e);
}
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
logger.info("Using RefSeq annotations from "+REFSEQ_FILE);
}
@ -282,8 +288,8 @@ public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerforma
if (annotationList == null) { return "UNKNOWN"; }
for(GATKFeature rec : annotationList) {
if ( ((rodRefSeq)rec.getUnderlyingObject()).overlapsExonP(target) ) {
return ((rodRefSeq)rec.getUnderlyingObject()).getGeneName();
if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) {
return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName();
}
}

View File

@ -173,9 +173,9 @@ public class VCF4UnitTest extends BaseTest {
}
// test too many info fields
// test too many info fields - NOT a valid test with validation turned off in the VCF4 reader
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test(expected=StingException.class)
//@Test(expected=StingException.class)
public void testCheckTooManyInfoFields() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoManyInfoLine);