A fix for the 'rod blows up when it hits a GenomeLoc outside the reference' issu

e.  Really a stopgap; error handling in the RODs needs to be addressed in a more comprehensive way.  Right now, hasNext() isn't guaranteed to be correct.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@878 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-06-02 18:14:46 +00:00
parent ad5b057140
commit 6e60cddfed
10 changed files with 128 additions and 22 deletions

View File

@ -79,7 +79,7 @@ public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum {
varFreq = Double.parseDouble(parts[11]); // CEU_var_freq
totalCounts = Integer.parseInt(parts[12]); // CEU_var
loc = new GenomeLoc(contig, start, stop);
loc = GenomeLoc.parseGenomeLoc(contig, start, stop);
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts));

View File

@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.MalformedGenomeLocException;
import org.apache.log4j.Logger;
/**
@ -263,11 +264,36 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
}
public ROD next() {
final String line = parser.next();
//System.out.printf("Line is %s%n", line);
String parts[] = line.split(fieldDelimiter);
ROD n = parseLine(parts);
return n != null ? n : next();
ROD n = null;
boolean success = false;
boolean firstFailure = true;
do {
final String line = parser.next();
//System.out.printf("Line is %s%n", line);
String parts[] = line.split(fieldDelimiter);
try {
n = parseLine(parts);
// Two failure conditions:
// 1) parseLine throws an exception.
// 2) parseLine returns null.
// 3) parseLine throws a RuntimeException.
// TODO: Clean this up so that all errors are handled in one spot.
success = (n != null);
}
catch( MalformedGenomeLocException ex ) {
if( firstFailure ) {
Utils.warnUser("Failed to parse contig on line '" + line + "'. Skipping ahead to the next recognized GenomeLoc.");
firstFailure = false;
}
if( !parser.hasNext() )
Utils.warnUser("Unable to find more valid reference-ordered data. Giving up.");
}
} while (!success && parser.hasNext());
return n;
}
public void remove() {
@ -308,7 +334,8 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
public boolean hasNext() { return it.hasNext(); }
public ROD next() {
ROD next = it.next();
position = next.getLocation().clone();
if( next != null )
position = next.getLocation().clone();
return next;
}
@ -336,6 +363,8 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
while ( hasNext() ) {
ROD current = next();
if( current == null )
continue;
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
int cmp = current.getLocation().compareTo(loc);
if ( cmp < 0 ) {

View File

@ -170,8 +170,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
if ( refBaseChar == '*' ) {
parseIndels(parts[3]) ;
if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
if ( varType == DELETION_VARIANT ) loc = GenomeLoc.parseGenomeLoc(contig, start, start+eventLength-1);
else loc = GenomeLoc.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
}
else {
parseBasesAndQuals(parts[8], parts[9]);
@ -180,8 +180,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
refBases = parts[2].toUpperCase();
eventLength = 1;
//loc = new GenomeLoc(contig, start, start+1);
loc = new GenomeLoc(contig, start, start);
//loc = GenomeLoc.parseGenomeLoc(contig, start, start+1);
loc = GenomeLoc.parseGenomeLoc(contig, start, start);
char ch = parts[3].charAt(0);

View File

@ -35,7 +35,7 @@ import org.apache.log4j.Logger;
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
* String rowData = String.format("%d %d %d", 1, 2, 3);
* TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
* TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" "));
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
*/
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {

View File

@ -75,8 +75,8 @@ public class Transcript {
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
String contig_name = fields[2];
transcript_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
transcript_coding_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
transcript_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
transcript_coding_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
gene_name = fields[12];
String[] exon_starts = fields[9].split(",");
String[] exon_stops = fields[10].split(",");
@ -89,7 +89,7 @@ public class Transcript {
exon_frames = new ArrayList<Integer>(eframes.length);
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
exons.add(new GenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
exons.add(GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
exon_frames.add(Integer.decode(eframes[i]));
}
}

View File

@ -7,6 +7,7 @@ import java.util.*;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.MalformedGenomeLocException;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
/**
@ -161,7 +162,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
String contig = parts[1];
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = new GenomeLoc(contig, start, stop);
loc = GenomeLoc.parseGenomeLoc(contig, start, stop);
name = parts[4];
refBases = parts[5];
@ -177,8 +178,11 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
weight = Integer.parseInt(parts[17]);
//System.out.printf("Parsed %s%n", toString());
return true;
} catch( MalformedGenomeLocException ex ) {
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
throw ex;
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts));
throw e;
}
}

View File

@ -73,7 +73,7 @@ public class rodGFF extends BasicReferenceOrderedDatum {
}
public GenomeLoc getLocation() {
return new GenomeLoc(contig, start, stop);
return GenomeLoc.parseGenomeLoc(contig, start, stop);
}
public String getAttribute(final String key) {

View File

@ -21,7 +21,7 @@ public class rodRefSeq extends BasicReferenceOrderedDatum {
public rodRefSeq(String name) {
super(name);
// location = new GenomeLoc(0,0,-1);
// location = GenomeLoc.parseGenomeLoc(0,0,-1);
}
/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
@ -182,7 +182,7 @@ class refSeqIterator implements Iterator<rodRefSeq> {
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
// of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list)
rodRefSeq rod = new rodRefSeq(name,new GenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
rodRefSeq rod = new rodRefSeq(name,GenomeLoc.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
// if ( (++z) % 1000000 == 0 ) {
// System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s");
// z = 0;

View File

@ -87,7 +87,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
public GenomeLoc( int contigIndex, final long start, final long stop ) {
if(contigInfo == null) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); }
if (contigIndex < 0 || contigIndex >= contigInfo.size()) {
if (!isSequenceIndexValid(contigIndex)) {
throw new StingException("Contig info has not been setup in the GenomeLoc context yet.");
}
if (start < 0) { throw new StingException("Bad start position " + start);}
@ -128,6 +128,19 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
return Long.parseLong(x);
}
/**
* Use this static constructor when the input data is under limited control (i.e. parsing user data).
* @param contig Contig to parse.
* @param start Starting point.
* @param stop Stop point.
* @return The genome location, or a MalformedGenomeLocException if unparseable.
*/
public static GenomeLoc parseGenomeLoc( final String contig, long start, long stop ) {
if( !isContigValid(contig) )
throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference.");
return new GenomeLoc(contig,start,stop);
}
public static GenomeLoc parseGenomeLoc( final String str ) {
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
@ -184,7 +197,10 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
stop = getContigInfo(contig).getSequenceLength();
}
GenomeLoc loc = new GenomeLoc(contig, start, stop);
if( !isContigValid(contig) )
throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference.");
GenomeLoc loc = parseGenomeLoc(contig,start,stop);
// System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
return loc;
@ -583,4 +599,24 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
}
}
}
/**
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
* @return True if the contig is valid. False otherwise.
*/
private static boolean isContigValid( String contig ) {
int contigIndex = contigInfo.getSequenceIndex(contig);
return isSequenceIndexValid(contigIndex);
}
/**
* Determines whether the given sequence index is valid with respect to the sequence dictionary.
* @param sequenceIndex sequence index
* @return True if the sequence index is valid, false otherwise.
*/
private static boolean isSequenceIndexValid( int sequenceIndex ) {
return sequenceIndex >= 0 && sequenceIndex < contigInfo.size();
}
}

View File

@ -0,0 +1,37 @@
package org.broadinstitute.sting.utils;
/**
* User: hanna
* Date: Jun 2, 2009
* Time: 11:43:48 AM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Indicates that something was wrong with in the parameters passed to create a GenomeLoc...
* bad sequence id out of bounds, etc.
*/
public class MalformedGenomeLocException extends StingException {
/**
* Create a new MalformedGenomeLocException with the given message. Does not preserve the existing stack trace.
* @param message The message.
*/
public MalformedGenomeLocException( String message ) {
super(message);
}
/**
* Create a new MalformedGenomeLocException with the given message and root cause.
* @param message The message.
* @param t The root cause.
*/
public MalformedGenomeLocException( String message, Throwable t ) {
super(message,t);
}
}