From 653f70efa2d7732d465125242c2911148352f784 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 16 Feb 2010 20:35:35 +0000 Subject: [PATCH] added methods to validate an interval before you try to make a GenomeLoc: boolean validGenomeLoc(). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2846 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/GenomeLocParser.java | 97 +++++++++++++++---- .../sting/utils/GenomeLocParserTest.java | 25 ++++- 2 files changed, 98 insertions(+), 24 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 9df8f8da1..7dead9ed1 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -89,11 +89,12 @@ public class GenomeLocParser { * Returns the contig index of a specified string version of the contig * * @param contig the contig string + * @param exceptionOut in some cases we don't want to exception out if the contig isn't valid * * @return the contig index, -1 if not found */ - public static int getContigIndex(final String contig) { - if (contigInfo.getSequenceIndex(contig) == -1) + public static int getContigIndex(final String contig, boolean exceptionOut) { + if (contigInfo.getSequenceIndex(contig) == -1 && exceptionOut) Utils.scareUser(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return contigInfo.getSequenceIndex(contig); @@ -300,7 +301,7 @@ public class GenomeLocParser { public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) { if (!isContigValid(contig)) throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference."); - return new GenomeLoc(contig, getContigIndex(contig), start, stop); + return new GenomeLoc(contig, getContigIndex(contig,true), start, stop); } @@ -339,7 +340,7 @@ public class GenomeLocParser { // iterate through the list of merged intervals and add then as GenomeLocs ret = new ArrayList(); for (Interval interval : il.getUniqueIntervals()) { - ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence()), interval.getStart(), interval.getEnd())); + ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd())); } return ret; @@ -351,7 +352,7 @@ public class GenomeLocParser { String locStr = Utils.join(";", lines); ret = parseGenomeLocs(locStr, rule); for(GenomeLoc locus: ret) - verifyGenomeLocBounds(locus); + exceptionOnInvalidGenomeLocBounds(locus); return ret; } catch (Exception e2) { logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage()); @@ -383,7 +384,7 @@ public class GenomeLocParser { */ public static GenomeLoc createGenomeLoc(String contig, final long start, final long stop) { checkSetup(); - return verifyGenomeLoc(new GenomeLoc(contig, GenomeLocParser.getContigIndex(contig), start, stop)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, GenomeLocParser.getContigIndex(contig,true), start, stop)); } /** @@ -417,7 +418,7 @@ public class GenomeLocParser { */ public static GenomeLoc createGenomeLoc(final SAMRecord read) { checkSetup(); - return verifyGenomeLoc(new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd())); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd())); } @@ -431,7 +432,7 @@ public class GenomeLocParser { */ public static GenomeLoc createGenomeLoc(final int contig, final long pos) { checkSetup(); - return verifyGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contig), contig, pos, pos)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contig), contig, pos, pos)); } /** @@ -444,12 +445,12 @@ public class GenomeLocParser { */ public static GenomeLoc createGenomeLoc(final String contig, final long pos) { checkSetup(); - return verifyGenomeLoc(new GenomeLoc(contig, GenomeLocParser.getContigIndex(contig), pos, pos)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, GenomeLocParser.getContigIndex(contig,true), pos, pos)); } public static GenomeLoc createGenomeLoc(final GenomeLoc toCopy) { checkSetup(); - return verifyGenomeLoc(new GenomeLoc(toCopy.getContig(), toCopy.getContigIndex(), toCopy.getStart(), toCopy.getStop())); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(toCopy.getContig(), toCopy.getContigIndex(), toCopy.getStart(), toCopy.getStop())); } /** @@ -460,7 +461,7 @@ public class GenomeLocParser { * * @return the genome loc if it's valid, otherwise we throw an exception */ - private static GenomeLoc verifyGenomeLoc(GenomeLoc toReturn) { + private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) { if (toReturn.getStart() < 0) { throw new StingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0"); } @@ -482,8 +483,8 @@ public class GenomeLocParser { * Verify the locus against the bounds of the contig. * @param locus Locus to verify. */ - private static void verifyGenomeLocBounds(GenomeLoc locus) { - verifyGenomeLoc(locus); + private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) { + exceptionOnInvalidGenomeLoc(locus); int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength(); if(locus.getStart() > contigSize) @@ -492,6 +493,60 @@ public class GenomeLocParser { throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig())); } + /** + * a method for validating genome locs as valid + * + * @param loc the location to validate + * + * @return true if the passed in GenomeLoc represents a valid location + */ + public static boolean validGenomeLoc(GenomeLoc loc) { + checkSetup(); + // quick check before we get the contig size, is the contig number valid + if ((loc.getContigIndex() < 0) || // the contig index has to be positive + (loc.getContigIndex() >= contigInfo.getSequences().size())) // the contig must be in the integer range of contigs) + return false; + + int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength(); + if ((loc.getStart() < 0) || // start must be greater than 0 + ((loc.getStop() != -1) && (loc.getStop() < 0)) || // the stop can be -1, but no other neg number + (loc.getStart() > contigSize) || // the start must be before or equal to the contig end + (loc.getStop() > contigSize)) // the stop must also be before or equal to the contig end + return false; + + // we passed + return true; + } + + /** + * validate a position or interval on the genome as valid + * + * @param contig the contig name + * @param start the start position + * @param stop the stop position + * + * @return true if it's valid, false otherwise + */ + public static boolean validGenomeLoc(String contig, long start, long stop) { + checkSetup(); + return validGenomeLoc(new GenomeLoc(contig, GenomeLocParser.getContigIndex(contig, false), start, stop)); + + } + + /** + * validate a position or interval on the genome as valid + * + * @param contigIndex the contig name + * @param start the start position + * @param stop the stop position + * + * @return true if it's valid, false otherwise + */ + public static boolean validGenomeLoc(int contigIndex, long start, long stop) { + checkSetup(); + if (contigIndex < 0 || contigIndex >= contigInfo.size()) return false; + return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop)); + } /** * Move this Genome loc to the next contig, with a start @@ -504,7 +559,7 @@ public class GenomeLocParser { if (current.getContigIndex() + 1 >= contigInfo.getSequences().size()) { return null; } else - return verifyGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(current.getContigIndex() + 1), current.getContigIndex() + 1, 1, 1)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(current.getContigIndex() + 1), current.getContigIndex() + 1, 1, 1)); } /** @@ -522,7 +577,7 @@ public class GenomeLocParser { if ((index = contigInfo.getSequenceIndex(contig)) < 0) { throw new StingException("Contig name ( " + contig + " ) not in the set sequence dictionary."); } - return verifyGenomeLoc(new GenomeLoc(contig, index, loc.start, loc.getStop())); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, index, loc.start, loc.getStop())); } /** @@ -535,7 +590,7 @@ public class GenomeLocParser { if ((contig >= GenomeLocParser.contigInfo.getSequences().size()) || (contig < 0)) { throw new StingException("Contig index ( " + contig + " ) is not in the sequence dictionary set."); } - return verifyGenomeLoc(new GenomeLoc(GenomeLocParser.contigInfo.getSequence(contig).getSequenceName(), contig, loc.start, loc.getStop())); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(GenomeLocParser.contigInfo.getSequence(contig).getSequenceName(), contig, loc.start, loc.getStop())); } @@ -556,7 +611,7 @@ public class GenomeLocParser { if (start > length) { throw new StingException("start value of " + start + " is greater than the contig length, and is not -1. (length = " + length + ")."); } - return verifyGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop())); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop())); } /** @@ -575,7 +630,7 @@ public class GenomeLocParser { if ((stop != -1) && (stop > GenomeLocParser.contigInfo.getSequences().get(loc.getContigIndex()).getSequenceLength())) { throw new StingException("stop value of " + stop + " is greater than the contig length, and is not -1."); } - return verifyGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop)); } /** @@ -598,7 +653,7 @@ public class GenomeLocParser { * @return a new genome loc */ public static GenomeLoc incPos(GenomeLoc loc, long by) { - return verifyGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by)); } /** @@ -629,8 +684,8 @@ public class GenomeLocParser { */ public static int compareContigs(String firstContig, String secondContig) { checkSetup(); - Integer ref1 = GenomeLocParser.getContigIndex(firstContig); - Integer ref2 = GenomeLocParser.getContigIndex(secondContig); + Integer ref1 = GenomeLocParser.getContigIndex(firstContig,true); + Integer ref2 = GenomeLocParser.getContigIndex(secondContig,true); return ref1.compareTo(ref2); } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java index 8f30428d8..2580b143e 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.utils; -import static junit.framework.Assert.assertTrue; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; @@ -8,6 +7,7 @@ import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.junit.BeforeClass; import org.junit.Test; @@ -47,13 +47,13 @@ public class GenomeLocParserTest extends BaseTest { @Test(expected = RuntimeException.class) public void testGetContigIndex() { - assertEquals(-1, GenomeLocParser.getContigIndex("blah")); // should be in the reference + assertEquals(-1, GenomeLocParser.getContigIndex("blah",true)); // should not be in the reference } @Test public void testGetContigIndexValid() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); - assertEquals(0, GenomeLocParser.getContigIndex("chr1")); // should be in the reference + assertEquals(0, GenomeLocParser.getContigIndex("chr1",true)); // should be in the reference } @Test @@ -219,4 +219,23 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } + + // test out the validating methods + @Test + public void testValidationOfGenomeLocs() { + assertTrue(GenomeLocParser.validGenomeLoc("chr1",1,1)); + assertTrue(!GenomeLocParser.validGenomeLoc("chr2",1,1)); // shouldn't have an entry + assertTrue(!GenomeLocParser.validGenomeLoc("chr1",1,11)); // past the end of the contig + assertTrue(!GenomeLocParser.validGenomeLoc("chr1",-1,10)); // bad start + assertTrue(!GenomeLocParser.validGenomeLoc("chr1",1,-2)); // bad stop + assertTrue(!GenomeLocParser.validGenomeLoc("chr1",10,11)); // bad start, past end + + assertTrue(GenomeLocParser.validGenomeLoc(0,1,1)); + assertTrue(!GenomeLocParser.validGenomeLoc(1,1,1)); // shouldn't have an entry + assertTrue(!GenomeLocParser.validGenomeLoc(0,1,11)); // past the end of the contig + assertTrue(!GenomeLocParser.validGenomeLoc(-1,0,10)); // bad start + assertTrue(!GenomeLocParser.validGenomeLoc(0,1,-2)); // bad stop + assertTrue(!GenomeLocParser.validGenomeLoc(0,10,11)); // bad start, past end + + } }