Two fixes together:
1) Some improvements to the VCF4 parsing, including disabling validation. 2) Reimplemented RefSeq in the new Tribble-style rod system. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3630 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
72e669538e
commit
682f9b46c6
|
|
@ -0,0 +1,80 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.features.refseq;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.util.LineReader;
|
||||
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* the ref seq codec
|
||||
*/
|
||||
public class RefSeqCodec implements FeatureCodec {
|
||||
|
||||
@Override
|
||||
public Feature decodeLoc(String line) {
|
||||
String fields[] = line.split("\t");
|
||||
String contig_name = fields[2];
|
||||
return new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
||||
}
|
||||
|
||||
/** Fills this object from a text line in RefSeq (UCSC) text dump file */
|
||||
@Override
|
||||
public Feature decode(String line) {
|
||||
String fields[] = line.split("\t");
|
||||
|
||||
String contig_name = fields[2];
|
||||
RefSeqFeature feature = new RefSeqFeature(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
||||
|
||||
feature.setTranscript_id(fields[1]);
|
||||
if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1);
|
||||
else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1);
|
||||
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
|
||||
|
||||
|
||||
feature.setTranscript_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
|
||||
feature.setTranscript_coding_interval(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])));
|
||||
feature.setGene_name(fields[12]);
|
||||
String[] exon_starts = fields[9].split(",");
|
||||
String[] exon_stops = fields[10].split(",");
|
||||
String[] eframes = fields[15].split(",");
|
||||
|
||||
assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
|
||||
assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
|
||||
|
||||
ArrayList<GenomeLoc> exons = new ArrayList<GenomeLoc>(exon_starts.length);
|
||||
ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);
|
||||
|
||||
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
|
||||
exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
|
||||
exon_frames.add(Integer.decode(eframes[i]));
|
||||
}
|
||||
|
||||
feature.setExons(exons);
|
||||
feature.setExon_frames(exon_frames);
|
||||
return feature;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readHeader(LineReader reader) {
|
||||
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class getFeatureType() {
|
||||
return RefSeqCodec.class;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getHeader(Class clazz) throws ClassCastException {
|
||||
return null; // we don't have a header
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.features.refseq;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* the ref seq feature
|
||||
*/
|
||||
public class RefSeqFeature implements Transcript, Feature {
|
||||
|
||||
private String transcript_id;
|
||||
private int strand;
|
||||
private GenomeLoc transcript_interval;
|
||||
private GenomeLoc transcript_coding_interval;
|
||||
private List<GenomeLoc> exons;
|
||||
private String gene_name;
|
||||
private List<Integer> exon_frames;
|
||||
private String name;
|
||||
|
||||
// store the contig, start, and stop for this record
|
||||
private final String contig;
|
||||
private final int start;
|
||||
private final int stop;
|
||||
|
||||
public RefSeqFeature(String contig, int start, int stop) {
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
}
|
||||
|
||||
/** Returns id of the transcript (RefSeq NM_* id) */
|
||||
public String getTranscriptId() { return transcript_id; }
|
||||
|
||||
/** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
|
||||
public int getStrand() { return strand; }
|
||||
|
||||
/** Returns transcript's full genomic interval (includes all exons with UTRs) */
|
||||
public GenomeLoc getLocation() {
|
||||
if (transcript_interval == null)
|
||||
transcript_interval = GenomeLocParser.parseGenomeLoc(contig,start,stop);
|
||||
return transcript_interval;
|
||||
}
|
||||
|
||||
/** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */
|
||||
public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
|
||||
|
||||
/** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */
|
||||
public String getGeneName() { return gene_name; }
|
||||
|
||||
/** Number of exons in this transcript */
|
||||
public int getNumExons() { return exons.size(); }
|
||||
|
||||
/** Genomic location of the n-th exon; throws an exception if n is out of bounds */
|
||||
public GenomeLoc getExonLocation(int n) {
|
||||
if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
|
||||
return exons.get(n);
|
||||
}
|
||||
|
||||
/** Returns the list of all exons in this transcript, as genomic intervals */
|
||||
public List<GenomeLoc> getExons() { return exons; }
|
||||
|
||||
/** Returns all exons falling ::entirely:: inside an interval **/
|
||||
public List<GenomeLoc> getExonsInInterval( GenomeLoc interval ) {
|
||||
List<GenomeLoc> relevantExons = new ArrayList<GenomeLoc>(exons.size());
|
||||
for ( GenomeLoc exon : getExons() ) {
|
||||
if ( interval.containsP(exon) ) {
|
||||
relevantExons.add(exon);
|
||||
}
|
||||
}
|
||||
|
||||
return relevantExons;
|
||||
}
|
||||
|
||||
/** convenience method; returns the numbers of the exons in the interval **/
|
||||
public List<Integer> getExonNumbersInInterval( GenomeLoc interval ) {
|
||||
List<Integer> numbers = new ArrayList<Integer>();
|
||||
int iNo = 0;
|
||||
for ( GenomeLoc exon : getExons() ) {
|
||||
if ( interval.containsP(exon) ) {
|
||||
numbers.add(iNo);
|
||||
}
|
||||
iNo++;
|
||||
}
|
||||
|
||||
return numbers;
|
||||
}
|
||||
|
||||
public String getTranscriptUniqueGeneName() {
|
||||
return String.format("%s(%s)",getGeneName(),getTranscriptId());
|
||||
}
|
||||
|
||||
public String getOverlapString(GenomeLoc position) {
|
||||
boolean is_exon = false;
|
||||
StringBuilder overlapString = new StringBuilder();
|
||||
int exonNo = 1;
|
||||
|
||||
for ( GenomeLoc exon : exons ) {
|
||||
if ( exon.containsP(position) ) {
|
||||
overlapString.append(String.format("exon_%d",exonNo));
|
||||
is_exon = true;
|
||||
break;
|
||||
}
|
||||
exonNo ++;
|
||||
}
|
||||
|
||||
if ( ! is_exon ) {
|
||||
if ( overlapsCodingP(position) ) {
|
||||
overlapString.append("Intron");
|
||||
} else {
|
||||
overlapString.append("UTR");
|
||||
}
|
||||
}
|
||||
|
||||
return overlapString.toString();
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
|
||||
public boolean overlapsP (GenomeLoc that) {
|
||||
return getLocation().overlapsP(that);
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
|
||||
* NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
|
||||
* but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
|
||||
* @see #overlapsExonP
|
||||
*/
|
||||
public boolean overlapsCodingP (GenomeLoc that) {
|
||||
return transcript_coding_interval.overlapsP(that);
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
|
||||
public boolean overlapsExonP (GenomeLoc that) {
|
||||
for ( GenomeLoc e : exons ) {
|
||||
if ( e.overlapsP(that) ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something
|
||||
b.append(transcript_id); // #1
|
||||
b.append('\t');
|
||||
b.append(getLocation().getContig()); // #2
|
||||
b.append('\t');
|
||||
b.append( (strand==1?'+':'-') ); // #3
|
||||
b.append('\t');
|
||||
b.append( (getLocation().getStart() - 1) ); // #4
|
||||
b.append('\t');
|
||||
b.append( getLocation().getStop()); // #5
|
||||
b.append('\t');
|
||||
b.append( (transcript_coding_interval.getStart() - 1) ); // #6
|
||||
b.append('\t');
|
||||
b.append( transcript_coding_interval.getStop()); // #7
|
||||
b.append('\t');
|
||||
b.append(exons.size()); // #8
|
||||
b.append('\t');
|
||||
for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9
|
||||
b.append('\t');
|
||||
for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10
|
||||
b.append("\t0\t"); // # 11 - unused?
|
||||
b.append(gene_name); // # 12
|
||||
b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused?
|
||||
for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15
|
||||
|
||||
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin an exon of <i>any</i> of those transcripts. Passing null is safe (will return false).
|
||||
* NOTE: position can be still within a UTR, see #isCoding
|
||||
* @return true if it's an exon
|
||||
*/
|
||||
public static boolean isExon(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin a coding region of <i>any</i> of those transcripts.
|
||||
* Passing null is safe (will return false).
|
||||
* NOTE: "coding" interval is defined as a single genomic interval, so it
|
||||
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
|
||||
* transcript, or internal exons that are not spliced into a given transcript. To check that a position is
|
||||
* indeed within an exon but not in UTR, use #isCodingExon().
|
||||
* @return
|
||||
*/
|
||||
public static boolean isCoding(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin a coding exon portion (i.e. true coding sequence) of <i>any</i> of those transcripts.
|
||||
* Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript,
|
||||
* for which the current position is within an exon <i>and</i> within a coding interval simultaneously.
|
||||
* @return
|
||||
*/
|
||||
public static boolean isCodingExon(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) && ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void setTranscript_id(String transcript_id) {
|
||||
this.transcript_id = transcript_id;
|
||||
}
|
||||
|
||||
public void setStrand(int strand) {
|
||||
this.strand = strand;
|
||||
}
|
||||
|
||||
public void setTranscript_interval(GenomeLoc transcript_interval) {
|
||||
this.transcript_interval = transcript_interval;
|
||||
}
|
||||
|
||||
public void setTranscript_coding_interval(GenomeLoc transcript_coding_interval) {
|
||||
this.transcript_coding_interval = transcript_coding_interval;
|
||||
}
|
||||
|
||||
public void setExons(List<GenomeLoc> exons) {
|
||||
this.exons = exons;
|
||||
}
|
||||
|
||||
public void setGene_name(String gene_name) {
|
||||
this.gene_name = gene_name;
|
||||
}
|
||||
|
||||
public void setExon_frames(List<Integer> exon_frames) {
|
||||
this.exon_frames = exon_frames;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getChr() {
|
||||
return contig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getEnd() {
|
||||
return stop;
|
||||
}
|
||||
}
|
||||
|
|
@ -35,7 +35,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||
|
||||
// cache the genotyope values
|
||||
private static String[] CachedGTValues = new String[100];
|
||||
private static String[] GTValueArray = new String[100];
|
||||
|
||||
// for performance testing purposes
|
||||
public static boolean validate = true;
|
||||
|
|
@ -56,7 +56,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
ArrayList<String> filterFields = new ArrayList<String>();
|
||||
|
||||
// do we want to validate the info, format, and filter fields
|
||||
private final boolean validateFromHeader = true;
|
||||
private final boolean validateFromHeader = false;
|
||||
|
||||
// we store a name to give to each of the variant contexts we emit
|
||||
private String name = "Unkown";
|
||||
|
|
@ -377,49 +377,72 @@ public class VCF4Codec implements FeatureCodec {
|
|||
|
||||
// do we have genotyping data
|
||||
if (parts.length > 8) {
|
||||
String GT = parts[8];
|
||||
int genotypesStart = 9;
|
||||
// parse genotypes
|
||||
int nGTKeys = ParsingUtils.split(GT, genotypeKeyArray, ':');
|
||||
genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
||||
Iterator<String> iter = header.getGenotypeSamples().iterator();
|
||||
|
||||
alleleMap.clear();
|
||||
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
|
||||
String sample = parts[genotypeOffset];
|
||||
String[] GTValues = CachedGTValues;
|
||||
ParsingUtils.split(sample, GTValues, ':');
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap);
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
|
||||
// todo -- the parsing of attributes could be made lazy for performance
|
||||
Map<String, String> gtAttributes = null;
|
||||
if (nGTKeys > 1) {
|
||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||
for (int i = 1; i < nGTKeys; i++) {
|
||||
if (genotypeKeyArray[i].equals("GQ")) {
|
||||
GTQual = parseQual(GTValues[i]);
|
||||
}
|
||||
if (genotypeKeyArray[i].equals("FL")) { // deal with genotype filters here
|
||||
genotypeFilters.addAll(parseFilters(GTValues[i]));
|
||||
} else {
|
||||
gtAttributes.put(genotypeKeyArray[i], GTValues[i]);
|
||||
}
|
||||
}
|
||||
// validate the format fields
|
||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||
}
|
||||
|
||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
||||
|
||||
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart);
|
||||
}
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a genotype map
|
||||
* @param parts the string parts
|
||||
* @param locAndAlleles the locations and the list of alleles
|
||||
* @param genotypesStart the position in the parts array that the genotype strings start
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
private Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int genotypesStart) {
|
||||
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
||||
|
||||
// get the format keys
|
||||
int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':');
|
||||
|
||||
// cycle through the sample names
|
||||
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
||||
|
||||
// clear out our allele mapping
|
||||
alleleMap.clear();
|
||||
|
||||
// cycle through the genotype strings
|
||||
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
|
||||
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap);
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
String sampleName = sampleNameIterator.next();
|
||||
|
||||
|
||||
// todo -- the parsing of attributes could be made lazy for performance
|
||||
Map<String, String> gtAttributes = null;
|
||||
|
||||
// check to see if the value list is longer than the key list, which is a problem
|
||||
if (nGTKeys < GTValueSplitSize)
|
||||
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||
|
||||
if (nGTKeys > 1) {
|
||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||
for (int i = 1; i < nGTKeys; i++) {
|
||||
if (i >= GTValueSplitSize)
|
||||
gtAttributes.put(genotypeKeyArray[i],".");
|
||||
else if (genotypeKeyArray[i].equals("GQ"))
|
||||
GTQual = parseQual(GTValueArray[i]);
|
||||
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
|
||||
genotypeFilters.addAll(parseFilters(GTValueArray[i]));
|
||||
else
|
||||
gtAttributes.put(genotypeKeyArray[i], GTValueArray[i]);
|
||||
|
||||
}
|
||||
// validate the format fields
|
||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||
}
|
||||
|
||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
||||
|
||||
Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* clip the alleles, based on the reference
|
||||
* @param unclippedAlleles the list of alleles
|
||||
|
|
|
|||
|
|
@ -1,252 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Sep 22, 2009
|
||||
* Time: 3:19:41 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodRefSeq extends BasicReferenceOrderedDatum implements Transcript {
|
||||
|
||||
private String transcript_id;
|
||||
private int strand;
|
||||
private GenomeLoc transcript_interval;
|
||||
private GenomeLoc transcript_coding_interval;
|
||||
private List<GenomeLoc> exons;
|
||||
private String gene_name;
|
||||
private List<Integer> exon_frames;
|
||||
|
||||
|
||||
public rodRefSeq(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
/** Returns id of the transcript (RefSeq NM_* id) */
|
||||
public String getTranscriptId() { return transcript_id; }
|
||||
/** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
|
||||
public int getStrand() { return strand; }
|
||||
/** Returns transcript's full genomic interval (includes all exons with UTRs) */
|
||||
public GenomeLoc getLocation() { return transcript_interval; }
|
||||
/** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */
|
||||
public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
|
||||
/** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */
|
||||
public String getGeneName() { return gene_name; }
|
||||
/** Number of exons in this transcript */
|
||||
public int getNumExons() { return exons.size(); }
|
||||
/** Genomic location of the n-th exon; throws an exception if n is out of bounds */
|
||||
public GenomeLoc getExonLocation(int n) {
|
||||
if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
|
||||
return exons.get(n);
|
||||
}
|
||||
/** Returns the list of all exons in this transcript, as genomic intervals */
|
||||
public List<GenomeLoc> getExons() { return exons; }
|
||||
|
||||
/** Returns all exons falling ::entirely:: inside an interval **/
|
||||
public List<GenomeLoc> getExonsInInterval( GenomeLoc interval ) {
|
||||
List<GenomeLoc> relevantExons = new ArrayList<GenomeLoc>(exons.size());
|
||||
for ( GenomeLoc exon : getExons() ) {
|
||||
if ( interval.containsP(exon) ) {
|
||||
relevantExons.add(exon);
|
||||
}
|
||||
}
|
||||
|
||||
return relevantExons;
|
||||
}
|
||||
|
||||
/** convenience method; returns the numbers of the exons in the interval **/
|
||||
public List<Integer> getExonNumbersInInterval( GenomeLoc interval ) {
|
||||
List<Integer> numbers = new ArrayList<Integer>();
|
||||
int iNo = 0;
|
||||
for ( GenomeLoc exon : getExons() ) {
|
||||
if ( interval.containsP(exon) ) {
|
||||
numbers.add(iNo);
|
||||
}
|
||||
iNo++;
|
||||
}
|
||||
|
||||
return numbers;
|
||||
}
|
||||
|
||||
public String getTranscriptUniqueGeneName() {
|
||||
return String.format("%s(%s)",getGeneName(),getTranscriptId());
|
||||
}
|
||||
|
||||
public String getOverlapString(GenomeLoc position) {
|
||||
boolean is_exon = false;
|
||||
StringBuilder overlapString = new StringBuilder();
|
||||
int exonNo = 1;
|
||||
|
||||
for ( GenomeLoc exon : exons ) {
|
||||
if ( exon.containsP(position) ) {
|
||||
overlapString.append(String.format("exon_%d",exonNo));
|
||||
is_exon = true;
|
||||
break;
|
||||
}
|
||||
exonNo ++;
|
||||
}
|
||||
|
||||
if ( ! is_exon ) {
|
||||
if ( overlapsCodingP(position) ) {
|
||||
overlapString.append("Intron");
|
||||
} else {
|
||||
overlapString.append("UTR");
|
||||
}
|
||||
}
|
||||
|
||||
return overlapString.toString();
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
|
||||
public boolean overlapsP (GenomeLoc that) {
|
||||
return transcript_interval.overlapsP(that);
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
|
||||
* NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
|
||||
* but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
|
||||
* @see #overlapsExonP
|
||||
*/
|
||||
public boolean overlapsCodingP (GenomeLoc that) {
|
||||
return transcript_coding_interval.overlapsP(that);
|
||||
}
|
||||
|
||||
/** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
|
||||
public boolean overlapsExonP (GenomeLoc that) {
|
||||
for ( GenomeLoc e : exons ) {
|
||||
if ( e.overlapsP(that) ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Fills this object from a text line in RefSeq (UCSC) text dump file */
|
||||
@Override
|
||||
public boolean parseLine(final Object header, String[] fields) {
|
||||
transcript_id = fields[1];
|
||||
if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1;
|
||||
else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1;
|
||||
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
|
||||
|
||||
String contig_name = fields[2];
|
||||
transcript_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
||||
transcript_coding_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
|
||||
gene_name = fields[12];
|
||||
String[] exon_starts = fields[9].split(",");
|
||||
String[] exon_stops = fields[10].split(",");
|
||||
String[] eframes = fields[15].split(",");
|
||||
|
||||
assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
|
||||
assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
|
||||
|
||||
exons = new ArrayList<GenomeLoc>(exon_starts.length);
|
||||
exon_frames = new ArrayList<Integer>(eframes.length);
|
||||
|
||||
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
|
||||
exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
|
||||
exon_frames.add(Integer.decode(eframes[i]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something
|
||||
b.append(transcript_id); // #1
|
||||
b.append('\t');
|
||||
b.append(transcript_interval.getContig()); // #2
|
||||
b.append('\t');
|
||||
b.append( (strand==1?'+':'-') ); // #3
|
||||
b.append('\t');
|
||||
b.append( (transcript_interval.getStart() - 1) ); // #4
|
||||
b.append('\t');
|
||||
b.append( transcript_interval.getStop()); // #5
|
||||
b.append('\t');
|
||||
b.append( (transcript_coding_interval.getStart() - 1) ); // #6
|
||||
b.append('\t');
|
||||
b.append( transcript_coding_interval.getStop()); // #7
|
||||
b.append('\t');
|
||||
b.append(exons.size()); // #8
|
||||
b.append('\t');
|
||||
for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9
|
||||
b.append('\t');
|
||||
for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10
|
||||
b.append("\t0\t"); // # 11 - unused?
|
||||
b.append(gene_name); // # 12
|
||||
b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused?
|
||||
for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15
|
||||
|
||||
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin an exon of <i>any</i> of those transcripts. Passing null is safe (will return false).
|
||||
* NOTE: position can be still within a UTR, see #isCoding
|
||||
* @return true if it's an exon
|
||||
*/
|
||||
public static boolean isExon(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin a coding region of <i>any</i> of those transcripts.
|
||||
* Passing null is safe (will return false).
|
||||
* NOTE: "coding" interval is defined as a single genomic interval, so it
|
||||
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
|
||||
* transcript, or internal exons that are not spliced into a given transcript. To check that a position is
|
||||
* indeed within an exon but not in UTR, use #isCodingExon().
|
||||
* @return
|
||||
*/
|
||||
public static boolean isCoding(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to
|
||||
* rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether
|
||||
* this position is fully whithin a coding exon portion (i.e. true coding sequence) of <i>any</i> of those transcripts.
|
||||
* Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript,
|
||||
* for which the current position is within an exon <i>and</i> within a coding interval simultaneously.
|
||||
* @return
|
||||
*/
|
||||
public static boolean isCodingExon(RODRecordList l) {
|
||||
|
||||
if ( l == null ) return false;
|
||||
|
||||
GenomeLoc loc = l.getLocation();
|
||||
|
||||
for ( GATKFeature t : l ) {
|
||||
if ( ((rodRefSeq)t.getUnderlyingObject()).overlapsCodingP(loc) && ((rodRefSeq)t.getUnderlyingObject()).overlapsExonP(loc) ) return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder {
|
|||
static {
|
||||
// All known ROD types
|
||||
Types.put("GELI", rodGELI.class);
|
||||
Types.put("RefSeq", rodRefSeq.class);
|
||||
Types.put("Table", TabularROD.class);
|
||||
Types.put("HapMap", HapMapROD.class);
|
||||
Types.put("Intervals", IntervalRod.class);
|
||||
|
|
|
|||
|
|
@ -29,11 +29,18 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broad.tribble.dbsnp.DbSNPCodec;
|
||||
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
||||
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
|
|
@ -46,6 +53,7 @@ import org.broadinstitute.sting.utils.collections.CircularArray;
|
|||
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -136,11 +144,15 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
|||
normal_context = new WindowContext(0,WINDOW_SIZE);
|
||||
|
||||
if ( RefseqFileName != null ) {
|
||||
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
|
||||
new java.io.File(RefseqFileName), rodRefSeq.class);
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
|
||||
|
||||
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
|
||||
logger.info("Using RefSeq annotations from "+RefseqFileName);
|
||||
try {
|
||||
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq"));
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to open file " + RefseqFileName, e);
|
||||
}
|
||||
logger.info("Using RefSeq annotations from "+RefseqFileName);
|
||||
}
|
||||
|
||||
if ( refseqIterator == null ) logger.info("No annotations available");
|
||||
|
|
@ -627,11 +639,11 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
|||
else {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
||||
if ( rodRefSeq.isExon(ann) ) {
|
||||
if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
if ( RefSeqFeature.isExon(ann) ) {
|
||||
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
else b.append(annUTR); // exon but not coding = UTR
|
||||
} else {
|
||||
if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
|
||||
}
|
||||
b.append('\t');
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -44,13 +45,26 @@ import java.util.LinkedList;
|
|||
* Prints out counts of the number of reference ordered data objects are
|
||||
* each locus for debugging RodWalkers.
|
||||
*/
|
||||
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
|
||||
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
|
||||
public boolean verbose = false;
|
||||
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
|
||||
public boolean showSkipped = false;
|
||||
|
||||
@Override
|
||||
public Pair<ExpandingArrayList<Long>, Long> treeReduce(Pair<ExpandingArrayList<Long>, Long> lhs, Pair<ExpandingArrayList<Long>, Long> rhs) {
|
||||
ExpandingArrayList<Long> nt = new ExpandingArrayList<Long>();
|
||||
nt.addAll(lhs.first);
|
||||
int index = 0;
|
||||
for (Long l : rhs.first)
|
||||
if (nt.get(index) == null)
|
||||
nt.add(l);
|
||||
else
|
||||
nt.set(index,nt.get(index) + l);
|
||||
return new Pair<ExpandingArrayList<Long>, Long>(nt, lhs.second + rhs.second);
|
||||
}
|
||||
|
||||
public class Datum {
|
||||
public long nRodsAtThisLocation = 0;
|
||||
public long nSkippedBases =0;
|
||||
|
|
|
|||
|
|
@ -1,21 +1,15 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.bed.BEDFeature;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -31,7 +25,7 @@ import java.util.*;
|
|||
public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
|
||||
|
||||
private HashMap<GenomeLoc,IntervalInfoBuilder> intervalBuffer = new HashMap<GenomeLoc,IntervalInfoBuilder>();
|
||||
private HashSet<rodRefSeq> refseqBuffer = new HashSet<rodRefSeq>();
|
||||
private HashSet<RefSeqFeature> refseqBuffer = new HashSet<RefSeqFeature>();
|
||||
private HashMap<String,BEDFeature> currentBedFeatures = new HashMap<String,BEDFeature>();
|
||||
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -59,8 +53,8 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
|
|||
|
||||
if ( refseqList != null && refseqList.size() > 0 ) {
|
||||
for ( Object seq : refseqList ) {
|
||||
if ( ! refseqBuffer.contains( (rodRefSeq) seq) ) {
|
||||
refseqBuffer.add( (rodRefSeq) seq );
|
||||
if ( ! refseqBuffer.contains( (RefSeqFeature) seq) ) {
|
||||
refseqBuffer.add( (RefSeqFeature) seq );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -84,7 +78,7 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
|
|||
private long process() {
|
||||
long nUpdate = 0l;
|
||||
for ( GenomeLoc interval : intervalBuffer.keySet() ) {
|
||||
for ( rodRefSeq refseq : refseqBuffer ) {
|
||||
for ( RefSeqFeature refseq : refseqBuffer ) {
|
||||
if ( interval.overlapsP(refseq.getLocation()) &&
|
||||
! intervalBuffer.get(interval).geneNames.contains(refseq.getTranscriptUniqueGeneName()) ) {
|
||||
// if the interval overlaps the gene transcript; and the gene is not already represented in the interval
|
||||
|
|
@ -119,14 +113,14 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
|
|||
* @return diddly
|
||||
*/
|
||||
public void cleanup(ReferenceContext ref) {
|
||||
List<rodRefSeq> toRemove = new ArrayList<rodRefSeq>();
|
||||
for ( rodRefSeq refseq : refseqBuffer ) {
|
||||
List<RefSeqFeature> toRemove = new ArrayList<RefSeqFeature>();
|
||||
for ( RefSeqFeature refseq : refseqBuffer ) {
|
||||
if ( refseq.getLocation().isBefore(ref.getLocus()) ) {
|
||||
toRemove.add(refseq);
|
||||
}
|
||||
}
|
||||
|
||||
for ( rodRefSeq refseq : toRemove ) {
|
||||
for ( RefSeqFeature refseq : toRemove ) {
|
||||
refseqBuffer.remove(refseq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -8,6 +9,10 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
|
|
@ -15,6 +20,8 @@ import org.broadinstitute.sting.utils.SampleUtils;
|
|||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -42,11 +49,11 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
|
|||
else {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
||||
if ( rodRefSeq.isExon(ann) ) {
|
||||
if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
if ( RefSeqFeature.isExon(ann) ) {
|
||||
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
else b.append(annUTR); // exon but not coding = UTR
|
||||
} else {
|
||||
if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
|
||||
}
|
||||
b.append('\t');
|
||||
|
|
@ -57,10 +64,15 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
|
|||
|
||||
public void initialize() {
|
||||
if ( RefseqFileName != null ) {
|
||||
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
|
||||
new java.io.File(RefseqFileName), rodRefSeq.class);
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
|
||||
|
||||
try {
|
||||
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq"));
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to open file " + RefseqFileName, e);
|
||||
}
|
||||
|
||||
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
|
||||
logger.info("Using RefSeq annotations from "+RefseqFileName);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,21 +31,22 @@ import net.sf.picard.util.IntervalList;
|
|||
import net.sf.picard.util.OverlapDetector;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
|
@ -108,10 +109,15 @@ public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerforma
|
|||
@Override
|
||||
public void initialize() {
|
||||
if ( REFSEQ_FILE != null ) {
|
||||
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
|
||||
new java.io.File(REFSEQ_FILE), rodRefSeq.class);
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
FeatureReader refseq = builder.createFeatureReader(RefSeqCodec.class, new File(REFSEQ_FILE)).first;
|
||||
|
||||
try {
|
||||
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(), "refseq"));
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to open file " + REFSEQ_FILE, e);
|
||||
}
|
||||
|
||||
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
|
||||
logger.info("Using RefSeq annotations from "+REFSEQ_FILE);
|
||||
}
|
||||
|
||||
|
|
@ -282,8 +288,8 @@ public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerforma
|
|||
if (annotationList == null) { return "UNKNOWN"; }
|
||||
|
||||
for(GATKFeature rec : annotationList) {
|
||||
if ( ((rodRefSeq)rec.getUnderlyingObject()).overlapsExonP(target) ) {
|
||||
return ((rodRefSeq)rec.getUnderlyingObject()).getGeneName();
|
||||
if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) {
|
||||
return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -173,9 +173,9 @@ public class VCF4UnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
|
||||
// test too many info fields
|
||||
// test too many info fields - NOT a valid test with validation turned off in the VCF4 reader
|
||||
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
@Test(expected=StingException.class)
|
||||
//@Test(expected=StingException.class)
|
||||
public void testCheckTooManyInfoFields() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
testSetup.codec.decode(twoManyInfoLine);
|
||||
|
|
|
|||
Loading…
Reference in New Issue