A fix for the 'rod blows up when it hits a GenomeLoc outside the reference' issue.
Really a stopgap; error handling in the RODs needs to be addressed in a more comprehensive way. Right now, hasNext() isn't guaranteed to be correct. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@878 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ad5b057140
commit
6e60cddfed
|
|
@ -79,7 +79,7 @@ public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum {
|
||||||
varFreq = Double.parseDouble(parts[11]); // CEU_var_freq
|
varFreq = Double.parseDouble(parts[11]); // CEU_var_freq
|
||||||
totalCounts = Integer.parseInt(parts[12]); // CEU_var
|
totalCounts = Integer.parseInt(parts[12]); // CEU_var
|
||||||
|
|
||||||
loc = new GenomeLoc(contig, start, stop);
|
loc = GenomeLoc.parseGenomeLoc(contig, start, stop);
|
||||||
|
|
||||||
} catch ( RuntimeException e ) {
|
} catch ( RuntimeException e ) {
|
||||||
System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts));
|
System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts));
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.xReadLines;
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.MalformedGenomeLocException;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -263,11 +264,36 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
}
|
}
|
||||||
|
|
||||||
public ROD next() {
|
public ROD next() {
|
||||||
final String line = parser.next();
|
ROD n = null;
|
||||||
//System.out.printf("Line is %s%n", line);
|
boolean success = false;
|
||||||
String parts[] = line.split(fieldDelimiter);
|
boolean firstFailure = true;
|
||||||
ROD n = parseLine(parts);
|
|
||||||
return n != null ? n : next();
|
do {
|
||||||
|
final String line = parser.next();
|
||||||
|
//System.out.printf("Line is %s%n", line);
|
||||||
|
String parts[] = line.split(fieldDelimiter);
|
||||||
|
|
||||||
|
try {
|
||||||
|
n = parseLine(parts);
|
||||||
|
// Three failure conditions:
|
||||||
|
// 1) parseLine throws an exception.
|
||||||
|
// 2) parseLine returns null.
|
||||||
|
// 3) parseLine throws a RuntimeException.
|
||||||
|
// TODO: Clean this up so that all errors are handled in one spot.
|
||||||
|
success = (n != null);
|
||||||
|
}
|
||||||
|
catch( MalformedGenomeLocException ex ) {
|
||||||
|
if( firstFailure ) {
|
||||||
|
Utils.warnUser("Failed to parse contig on line '" + line + "'. Skipping ahead to the next recognized GenomeLoc.");
|
||||||
|
firstFailure = false;
|
||||||
|
}
|
||||||
|
if( !parser.hasNext() )
|
||||||
|
Utils.warnUser("Unable to find more valid reference-ordered data. Giving up.");
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (!success && parser.hasNext());
|
||||||
|
|
||||||
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void remove() {
|
public void remove() {
|
||||||
|
|
@ -308,7 +334,8 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
public boolean hasNext() { return it.hasNext(); }
|
public boolean hasNext() { return it.hasNext(); }
|
||||||
public ROD next() {
|
public ROD next() {
|
||||||
ROD next = it.next();
|
ROD next = it.next();
|
||||||
position = next.getLocation().clone();
|
if( next != null )
|
||||||
|
position = next.getLocation().clone();
|
||||||
return next;
|
return next;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -336,6 +363,8 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
||||||
while ( hasNext() ) {
|
while ( hasNext() ) {
|
||||||
ROD current = next();
|
ROD current = next();
|
||||||
|
if( current == null )
|
||||||
|
continue;
|
||||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||||
int cmp = current.getLocation().compareTo(loc);
|
int cmp = current.getLocation().compareTo(loc);
|
||||||
if ( cmp < 0 ) {
|
if ( cmp < 0 ) {
|
||||||
|
|
|
||||||
|
|
@ -170,8 +170,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
|
||||||
if ( refBaseChar == '*' ) {
|
if ( refBaseChar == '*' ) {
|
||||||
|
|
||||||
parseIndels(parts[3]) ;
|
parseIndels(parts[3]) ;
|
||||||
if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
|
if ( varType == DELETION_VARIANT ) loc = GenomeLoc.parseGenomeLoc(contig, start, start+eventLength-1);
|
||||||
else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
|
else loc = GenomeLoc.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
parseBasesAndQuals(parts[8], parts[9]);
|
parseBasesAndQuals(parts[8], parts[9]);
|
||||||
|
|
@ -180,8 +180,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
|
||||||
|
|
||||||
refBases = parts[2].toUpperCase();
|
refBases = parts[2].toUpperCase();
|
||||||
eventLength = 1;
|
eventLength = 1;
|
||||||
//loc = new GenomeLoc(contig, start, start+1);
|
//loc = GenomeLoc.parseGenomeLoc(contig, start, start+1);
|
||||||
loc = new GenomeLoc(contig, start, start);
|
loc = GenomeLoc.parseGenomeLoc(contig, start, start);
|
||||||
|
|
||||||
char ch = parts[3].charAt(0);
|
char ch = parts[3].charAt(0);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@ import org.apache.log4j.Logger;
|
||||||
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||||
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||||
* String rowData = String.format("%d %d %d", 1, 2, 3);
|
* String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||||
* TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
|
* TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" "));
|
||||||
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
||||||
*/
|
*/
|
||||||
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
|
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
|
||||||
|
|
|
||||||
|
|
@ -75,8 +75,8 @@ public class Transcript {
|
||||||
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
|
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
|
||||||
|
|
||||||
String contig_name = fields[2];
|
String contig_name = fields[2];
|
||||||
transcript_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
transcript_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
||||||
transcript_coding_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
|
transcript_coding_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
|
||||||
gene_name = fields[12];
|
gene_name = fields[12];
|
||||||
String[] exon_starts = fields[9].split(",");
|
String[] exon_starts = fields[9].split(",");
|
||||||
String[] exon_stops = fields[10].split(",");
|
String[] exon_stops = fields[10].split(",");
|
||||||
|
|
@ -89,7 +89,7 @@ public class Transcript {
|
||||||
exon_frames = new ArrayList<Integer>(eframes.length);
|
exon_frames = new ArrayList<Integer>(eframes.length);
|
||||||
|
|
||||||
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
|
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
|
||||||
exons.add(new GenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
|
exons.add(GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
|
||||||
exon_frames.add(Integer.decode(eframes[i]));
|
exon_frames.add(Integer.decode(eframes[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import java.util.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.MalformedGenomeLocException;
|
||||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -161,7 +162,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
|
||||||
String contig = parts[1];
|
String contig = parts[1];
|
||||||
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||||
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||||
loc = new GenomeLoc(contig, start, stop);
|
loc = GenomeLoc.parseGenomeLoc(contig, start, stop);
|
||||||
|
|
||||||
name = parts[4];
|
name = parts[4];
|
||||||
refBases = parts[5];
|
refBases = parts[5];
|
||||||
|
|
@ -177,8 +178,11 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
|
||||||
weight = Integer.parseInt(parts[17]);
|
weight = Integer.parseInt(parts[17]);
|
||||||
//System.out.printf("Parsed %s%n", toString());
|
//System.out.printf("Parsed %s%n", toString());
|
||||||
return true;
|
return true;
|
||||||
|
} catch( MalformedGenomeLocException ex ) {
|
||||||
|
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
|
||||||
|
throw ex;
|
||||||
} catch ( RuntimeException e ) {
|
} catch ( RuntimeException e ) {
|
||||||
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts));
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -73,7 +73,7 @@ public class rodGFF extends BasicReferenceOrderedDatum {
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenomeLoc getLocation() {
|
public GenomeLoc getLocation() {
|
||||||
return new GenomeLoc(contig, start, stop);
|
return GenomeLoc.parseGenomeLoc(contig, start, stop);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getAttribute(final String key) {
|
public String getAttribute(final String key) {
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@ public class rodRefSeq extends BasicReferenceOrderedDatum {
|
||||||
|
|
||||||
public rodRefSeq(String name) {
|
public rodRefSeq(String name) {
|
||||||
super(name);
|
super(name);
|
||||||
// location = new GenomeLoc(0,0,-1);
|
// location = GenomeLoc.parseGenomeLoc(0,0,-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
|
/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
|
||||||
|
|
@ -182,7 +182,7 @@ class refSeqIterator implements Iterator<rodRefSeq> {
|
||||||
|
|
||||||
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
|
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
|
||||||
// of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list)
|
// of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list)
|
||||||
rodRefSeq rod = new rodRefSeq(name,new GenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
|
rodRefSeq rod = new rodRefSeq(name,GenomeLoc.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
|
||||||
// if ( (++z) % 1000000 == 0 ) {
|
// if ( (++z) % 1000000 == 0 ) {
|
||||||
// System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s");
|
// System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s");
|
||||||
// z = 0;
|
// z = 0;
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
public GenomeLoc( int contigIndex, final long start, final long stop ) {
|
public GenomeLoc( int contigIndex, final long start, final long stop ) {
|
||||||
if(contigInfo == null) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); }
|
if(contigInfo == null) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); }
|
||||||
|
|
||||||
if (contigIndex < 0 || contigIndex >= contigInfo.size()) {
|
if (!isSequenceIndexValid(contigIndex)) {
|
||||||
throw new StingException("Contig info has not been setup in the GenomeLoc context yet.");
|
throw new StingException("Contig info has not been setup in the GenomeLoc context yet.");
|
||||||
}
|
}
|
||||||
if (start < 0) { throw new StingException("Bad start position " + start);}
|
if (start < 0) { throw new StingException("Bad start position " + start);}
|
||||||
|
|
@ -128,6 +128,19 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
return Long.parseLong(x);
|
return Long.parseLong(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use this static constructor when the input data is under limited control (i.e. parsing user data).
|
||||||
|
* @param contig Contig to parse.
|
||||||
|
* @param start Starting point.
|
||||||
|
* @param stop Stop point.
|
||||||
|
* @return The genome location; a MalformedGenomeLocException is thrown if the contig is not in the sequence dictionary.
|
||||||
|
*/
|
||||||
|
public static GenomeLoc parseGenomeLoc( final String contig, long start, long stop ) {
|
||||||
|
if( !isContigValid(contig) )
|
||||||
|
throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference.");
|
||||||
|
return new GenomeLoc(contig,start,stop);
|
||||||
|
}
|
||||||
|
|
||||||
public static GenomeLoc parseGenomeLoc( final String str ) {
|
public static GenomeLoc parseGenomeLoc( final String str ) {
|
||||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
//System.out.printf("Parsing location '%s'%n", str);
|
//System.out.printf("Parsing location '%s'%n", str);
|
||||||
|
|
@ -184,7 +197,10 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
stop = getContigInfo(contig).getSequenceLength();
|
stop = getContigInfo(contig).getSequenceLength();
|
||||||
}
|
}
|
||||||
|
|
||||||
GenomeLoc loc = new GenomeLoc(contig, start, stop);
|
if( !isContigValid(contig) )
|
||||||
|
throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference.");
|
||||||
|
|
||||||
|
GenomeLoc loc = parseGenomeLoc(contig,start,stop);
|
||||||
// System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
|
// System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
|
||||||
|
|
||||||
return loc;
|
return loc;
|
||||||
|
|
@ -583,4 +599,24 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether the given contig is valid with respect to the sequence dictionary
|
||||||
|
* already installed in the GenomeLoc.
|
||||||
|
* @return True if the contig is valid. False otherwise.
|
||||||
|
*/
|
||||||
|
private static boolean isContigValid( String contig ) {
|
||||||
|
int contigIndex = contigInfo.getSequenceIndex(contig);
|
||||||
|
return isSequenceIndexValid(contigIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether the given sequence index is valid with respect to the sequence dictionary.
|
||||||
|
* @param sequenceIndex sequence index
|
||||||
|
* @return True if the sequence index is valid, false otherwise.
|
||||||
|
*/
|
||||||
|
private static boolean isSequenceIndexValid( int sequenceIndex ) {
|
||||||
|
return sequenceIndex >= 0 && sequenceIndex < contigInfo.size();
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
/**
|
||||||
|
* User: hanna
|
||||||
|
* Date: Jun 2, 2009
|
||||||
|
* Time: 11:43:48 AM
|
||||||
|
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
|
||||||
|
* Software and documentation are copyright 2005 by the Broad Institute.
|
||||||
|
* All rights are reserved.
|
||||||
|
*
|
||||||
|
* Users acknowledge that this software is supplied without any warranty or support.
|
||||||
|
* The Broad Institute is not responsible for its use, misuse, or
|
||||||
|
* functionality.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates that something was wrong with the parameters passed to create a GenomeLoc...
|
||||||
|
* bad sequence id out of bounds, etc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class MalformedGenomeLocException extends StingException {
|
||||||
|
/**
|
||||||
|
* Create a new MalformedGenomeLocException with the given message. Does not preserve the existing stack trace.
|
||||||
|
* @param message The message.
|
||||||
|
*/
|
||||||
|
public MalformedGenomeLocException( String message ) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new MalformedGenomeLocException with the given message and root cause.
|
||||||
|
* @param message The message.
|
||||||
|
* @param t The root cause.
|
||||||
|
*/
|
||||||
|
public MalformedGenomeLocException( String message, Throwable t ) {
|
||||||
|
super(message,t);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue