diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java index bed1ac766..510601fec 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java @@ -79,7 +79,7 @@ public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum { varFreq = Double.parseDouble(parts[11]); // CEU_var_freq totalCounts = Integer.parseInt(parts[12]); // CEU_var - loc = new GenomeLoc(contig, start, stop); + loc = GenomeLoc.parseGenomeLoc(contig, start, stop); } catch ( RuntimeException e ) { System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts)); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 7aaaf0397..10a90684f 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.refdata.rodRefSeq; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.xReadLines; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.MalformedGenomeLocException; import org.apache.log4j.Logger; /** @@ -263,11 +264,36 @@ public class ReferenceOrderedData implements } public ROD next() { - final String line = parser.next(); - //System.out.printf("Line is %s%n", line); - String parts[] = line.split(fieldDelimiter); - ROD n = parseLine(parts); - return n != null ? n : next(); + ROD n = null; + boolean success = false; + boolean firstFailure = true; + + do { + final String line = parser.next(); + //System.out.printf("Line is %s%n", line); + String parts[] = line.split(fieldDelimiter); + + try { + n = parseLine(parts); + // Two failure conditions: + // 1) parseLine throws an exception. + // 2) parseLine returns null. + // 3) parseLine throws a RuntimeException. + // TODO: Clean this up so that all errors are handled in one spot. + success = (n != null); + } + catch( MalformedGenomeLocException ex ) { + if( firstFailure ) { + Utils.warnUser("Failed to parse contig on line '" + line + "'. Skipping ahead to the next recognized GenomeLoc."); + firstFailure = false; + } + if( !parser.hasNext() ) + Utils.warnUser("Unable to find more valid reference-ordered data. Giving up."); + } + + } while (!success && parser.hasNext()); + + return n; } public void remove() { @@ -308,7 +334,8 @@ public class ReferenceOrderedData implements public boolean hasNext() { return it.hasNext(); } public ROD next() { ROD next = it.next(); - position = next.getLocation().clone(); + if( next != null ) + position = next.getLocation().clone(); return next; } @@ -336,6 +363,8 @@ public class ReferenceOrderedData implements if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart()); while ( hasNext() ) { ROD current = next(); + if( current == null ) + continue; //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); int cmp = current.getLocation().compareTo(loc); if ( cmp < 0 ) { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java index f09b77637..350f3a6bf 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java @@ -170,8 +170,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup { if ( refBaseChar == '*' ) { parseIndels(parts[3]) ; - if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1); - else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! + if ( varType == DELETION_VARIANT ) loc = GenomeLoc.parseGenomeLoc(contig, start, start+eventLength-1); + else loc = GenomeLoc.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! } else { parseBasesAndQuals(parts[8], parts[9]); @@ -180,8 +180,8 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup { refBases = parts[2].toUpperCase(); eventLength = 1; - //loc = new GenomeLoc(contig, start, start+1); - loc = new GenomeLoc(contig, start, start); + //loc = GenomeLoc.parseGenomeLoc(contig, start, start+1); + loc = GenomeLoc.parseGenomeLoc(contig, start, start); char ch = parts[3].charAt(0); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java index fddc566c2..48a844ab9 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -35,7 +35,7 @@ import org.apache.log4j.Logger; * ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); * assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); * String rowData = String.format("%d %d %d", 1, 2, 3); - * TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" ")); + * TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" ")); * assertTrue(row.toString().equals("chrM:1\t1\t2\t3")); */ public class TabularROD extends BasicReferenceOrderedDatum implements Map { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java index 41d99f4e7..dcb200b4b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java @@ -75,8 +75,8 @@ public class Transcript { else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]); String contig_name = fields[2]; - transcript_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); - transcript_coding_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])); + transcript_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); + transcript_coding_interval = GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])); gene_name = fields[12]; String[] exon_starts = fields[9].split(","); String[] exon_stops = fields[10].split(","); @@ -89,7 +89,7 @@ public class Transcript { exon_frames = new ArrayList(eframes.length); for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(new GenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exons.add(GenomeLoc.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); exon_frames.add(Integer.decode(eframes[i])); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 57255a25e..1f8a30e57 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -7,6 +7,7 @@ import java.util.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.MalformedGenomeLocException; import org.broadinstitute.sting.gatk.refdata.AllelicVariant; /** @@ -161,7 +162,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria String contig = parts[1]; long start = Long.parseLong(parts[2]) + 1; // The final is 0 based long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based - loc = new GenomeLoc(contig, start, stop); + loc = GenomeLoc.parseGenomeLoc(contig, start, stop); name = parts[4]; refBases = parts[5]; @@ -177,8 +178,11 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria weight = Integer.parseInt(parts[17]); //System.out.printf("Parsed %s%n", toString()); return true; + } catch( MalformedGenomeLocException ex ) { + // Just rethrow malformed genome locs; the ROD system itself will deal with these. + throw ex; } catch ( RuntimeException e ) { - System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); + System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts)); throw e; } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java index 752c907bb..b8729aea7 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java @@ -73,7 +73,7 @@ public class rodGFF extends BasicReferenceOrderedDatum { } public GenomeLoc getLocation() { - return new GenomeLoc(contig, start, stop); + return GenomeLoc.parseGenomeLoc(contig, start, stop); } public String getAttribute(final String key) { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java index e05d8047f..f7101e89d 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java @@ -21,7 +21,7 @@ public class rodRefSeq extends BasicReferenceOrderedDatum { public rodRefSeq(String name) { super(name); -// location = new GenomeLoc(0,0,-1); +// location = GenomeLoc.parseGenomeLoc(0,0,-1); } /** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will @@ -182,7 +182,7 @@ class refSeqIterator implements Iterator { // 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list // of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list) - rodRefSeq rod = new rodRefSeq(name,new GenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records)); + rodRefSeq rod = new rodRefSeq(name,GenomeLoc.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records)); // if ( (++z) % 1000000 == 0 ) { // System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s"); // z = 0; diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 3f4e97b1e..2adede24d 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -87,7 +87,7 @@ public class GenomeLoc implements Comparable, Cloneable { public GenomeLoc( int contigIndex, final long start, final long stop ) { if(contigInfo == null) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); } - if (contigIndex < 0 || contigIndex >= contigInfo.size()) { + if (!isSequenceIndexValid(contigIndex)) { throw new StingException("Contig info has not been setup in the GenomeLoc context yet."); } if (start < 0) { throw new StingException("Bad start position " + start);} @@ -128,6 +128,19 @@ public class GenomeLoc implements Comparable, Cloneable { return Long.parseLong(x); } + /** + * Use this static constructor when the input data is under limited control (i.e. parsing user data). + * @param contig Contig to parse. + * @param start Starting point. + * @param stop Stop point. + * @return The genome location, or a MalformedGenomeLocException if unparseable. + */ + public static GenomeLoc parseGenomeLoc( final String contig, long start, long stop ) { + if( !isContigValid(contig) ) + throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference."); + return new GenomeLoc(contig,start,stop); + } + public static GenomeLoc parseGenomeLoc( final String str ) { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); @@ -184,7 +197,10 @@ public class GenomeLoc implements Comparable, Cloneable { stop = getContigInfo(contig).getSequenceLength(); } - GenomeLoc loc = new GenomeLoc(contig, start, stop); + if( !isContigValid(contig) ) + throw new MalformedGenomeLocException("Contig " + contig + " is not within installed in the GATK sequence dictionary derived from the reference."); + + GenomeLoc loc = parseGenomeLoc(contig,start,stop); // System.out.printf(" => Parsed location '%s' into %s%n", str, loc); return loc; @@ -583,4 +599,24 @@ public class GenomeLoc implements Comparable, Cloneable { } } } + + /** + * Determines whether the given contig is valid with respect to the sequence dictionary + * already installed in the GenomeLoc. + * @return True if the contig is valid. False otherwise. + */ + private static boolean isContigValid( String contig ) { + int contigIndex = contigInfo.getSequenceIndex(contig); + return isSequenceIndexValid(contigIndex); + } + + /** + * Determines whether the given sequence index is valid with respect to the sequence dictionary. + * @param sequenceIndex sequence index + * @return True if the sequence index is valid, false otherwise. + */ + private static boolean isSequenceIndexValid( int sequenceIndex ) { + return sequenceIndex >= 0 && sequenceIndex < contigInfo.size(); + + } } diff --git a/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java b/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java new file mode 100644 index 000000000..91c74f905 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting.utils; +/** + * User: hanna + * Date: Jun 2, 2009 + * Time: 11:43:48 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Indicates that something was wrong with in the parameters passed to create a GenomeLoc... + * bad sequence id out of bounds, etc. + */ + +public class MalformedGenomeLocException extends StingException { + /** + * Create a new MalformedGenomeLocException with the given message. Does not preserve the existing stack trace. + * @param message The message. + */ + public MalformedGenomeLocException( String message ) { + super(message); + } + + /** + * Create a new MalformedGenomeLocException with the given message and root cause. + * @param message The message. + * @param t The root cause. + */ + public MalformedGenomeLocException( String message, Throwable t ) { + super(message,t); + } +}