From ce72932a454fec661d12c410ed377adacc680933 Mon Sep 17 00:00:00 2001 From: kcibul Date: Sun, 12 Apr 2009 02:25:17 +0000 Subject: [PATCH] * refactored GenomeLoc to use contigIndex internally for performance and fixed several calling classes * added basic unit test for GenomeLoc * fixed bug when parsing genome locations like chr1:5000 the start position was being left as maxint rather than being set to the same as the stop position. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@365 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/iterators/LocusIteratorByHanger.java | 4 +- .../gatk/walkers/SomaticCoverageWalker.java | 21 +++-- .../broadinstitute/sting/utils/GenomeLoc.java | 62 ++++++++------- .../sting/utils/GenomeLocTest.java | 77 +++++++++++++++++++ 4 files changed, 128 insertions(+), 36 deletions(-) create mode 100644 java/test/org/broadinstitute/sting/utils/GenomeLocTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java index afa9f873c..0e361f193 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java @@ -113,7 +113,7 @@ public class LocusIteratorByHanger extends LocusIterator { } protected void hangRead(final SAMRecord read) { - GenomeLoc readLoc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); + GenomeLoc readLoc = new GenomeLoc(read.getReferenceIndex(), read.getAlignmentStart()); //System.out.printf("Adding read %s at %d%n", read.getReadName(), read.getAlignmentStart()); /* @@ -128,7 +128,7 @@ public class LocusIteratorByHanger extends LocusIterator { if ( DEBUG ) logger.debug(String.format("Processing block %s len=%d%n", block, block.getLength())); for ( int i = 0; i < block.getLength(); i++ ) { - GenomeLoc offset = new GenomeLoc(readLoc.getContig(), block.getReferenceStart() + i); + GenomeLoc offset = new GenomeLoc(readLoc.getContigIndex(), block.getReferenceStart() + i); readHanger.expandingPut(offset, read); offsetHanger.expandingPut(offset, block.getReadStart() + i - 1); if ( DEBUG ) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SomaticCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SomaticCoverageWalker.java index 2536fab2a..b752314b9 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SomaticCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SomaticCoverageWalker.java @@ -43,8 +43,10 @@ public class SomaticCoverageWalker extends LocusWalker { int tumorCovered; int normalCovered; int somaticCovered; + long start = 0; public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + if (start ==0) { start = System.currentTimeMillis(); } List reads = context.getReads(); int tumorDepth = 0; @@ -58,10 +60,10 @@ public class SomaticCoverageWalker extends LocusWalker { // that come from fully mapped pairs with a mappign quality threshold >= x if (read.getNotPrimaryAlignmentFlag() || read.getDuplicateReadFlag() || - read.getReadUnmappedFlag() - || - read.getMateUnmappedFlag() || + read.getReadUnmappedFlag() || read.getMappingQuality() < MAPPING_QUALITY_THRESHOLD +// || +// read.getMateUnmappedFlag() || ) { continue; } @@ -90,10 +92,15 @@ public class SomaticCoverageWalker extends LocusWalker { if (isTumorCovered && isNormalCovered) {somaticCovered++; } totalSites++; - if (totalSites % 20000 == 0) { - out.println(String.format("%s:%d %d %d %d %d", context.getContig(), context.getPosition(), totalSites, tumorCovered, normalCovered, somaticCovered)); - } +// if (totalSites % 100000 == 0) { +// long now = System.currentTimeMillis(); +// out.println(String.format("%s:%d %d %d %d %d %dms", context.getContig(), context.getPosition(), totalSites, tumorCovered, normalCovered, somaticCovered, (now-start))); +// } + StringBuilder sb = new StringBuilder(); + sb.append(context.getContig()).append(" "); + sb.append(context.getPosition()); + out.println(sb.toString()); return 1; } @@ -107,7 +114,7 @@ public class SomaticCoverageWalker extends LocusWalker { @Override public void onTraversalDone(Integer result) { - out.println(String.format("FINAL - %d %d %d %d", totalSites, tumorCovered, normalCovered, somaticCovered)); +// out.println(String.format("FINAL - %d %d %d %d", totalSites, tumorCovered, normalCovered, somaticCovered)); } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 8bd1d319c..54a59123d 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -30,7 +30,7 @@ import org.apache.log4j.Logger; public class GenomeLoc implements Comparable { private static Logger logger = Logger.getLogger(GenomeLoc.class); - private String contig; + private Integer contigIndex; private long start; private long stop; @@ -41,7 +41,6 @@ public class GenomeLoc implements Comparable { // -------------------------------------------------------------------------------------------------------------- //public static Map refContigOrdering = null; private static SAMSequenceDictionary contigInfo = null; - private static HashMap interns = null; public static boolean hasKnownContigOrdering() { return contigInfo != null; @@ -52,6 +51,7 @@ public class GenomeLoc implements Comparable { } public static int getContigIndex( final String contig ) { + // if the conig isn't found, a null is returned which will NPE here (which is what we want right now) return contigInfo.getSequenceIndex(contig); } @@ -113,28 +113,32 @@ public class GenomeLoc implements Comparable { // constructors // // -------------------------------------------------------------------------------------------------------------- - public GenomeLoc( String contig, final long start, final long stop ) { - if ( interns != null ) - contig = interns.get(contig); + public GenomeLoc( int contigIndex, final long start, final long stop ) { - this.contig = contig; + this.contigIndex = contigIndex; this.start = start; this.stop = stop; } + public GenomeLoc( String contig, final long start, final long stop ) { + this(contigInfo.getSequenceIndex(contig), start, stop); + } + public GenomeLoc( final String contig, final long pos ) { - this( contig, pos, pos ); + this(contig, pos, pos ); + } + + public GenomeLoc( final int contig, final long pos ) { + this(contig, pos, pos ); } public GenomeLoc( final GenomeLoc toCopy ) { - this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() ); + this( toCopy.contigIndex, toCopy.getStart(), toCopy.getStop() ); } + // TODO: why isn't this just a constructor? public static GenomeLoc genomicLocationOf(final SAMRecord read) { - String contig = read.getReferenceName(); - if (read.getReadUnmappedFlag()) - contig = null; - return new GenomeLoc(contig, read.getAlignmentStart(), read.getAlignmentEnd()); + return new GenomeLoc(read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd()); } // -------------------------------------------------------------------------------------------------------------- @@ -171,6 +175,7 @@ public class GenomeLoc implements Comparable { else if ( match2.matches() ) { contig = match2.group(1); start = parsePosition(match2.group(2)); + stop = start; } else if ( match3.matches() ) { contig = match3.group(1); @@ -304,7 +309,7 @@ public class GenomeLoc implements Comparable { return returnTrueIfEmpty; // skip loci before intervals begin - if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) ) + if ( hasKnownContigOrdering() && curr.contigIndex < locs.get(0).contigIndex ) return false; for ( GenomeLoc loc : locs ) { @@ -320,7 +325,8 @@ public class GenomeLoc implements Comparable { // // Accessors and setters // - public final String getContig() { return this.contig; } + public final String getContig() { return contigInfo.getSequence(this.contigIndex).getSequenceName(); } + public final int getContigIndex() { return this.contigIndex; } public final long getStart() { return this.start; } public final long getStop() { return this.stop; } public final String toString() { @@ -333,12 +339,12 @@ public class GenomeLoc implements Comparable { } - public final boolean isUnmapped() { return this.contig == null; } + public final boolean isUnmapped() { return this.contigIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } public final boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } public final boolean atBeginningOfContigP() { return this.start == 1; } public void setContig(String contig) { - this.contig = contig; + this.contigIndex = contigInfo.getSequenceIndex(contig); } public void setStart(long start) { @@ -351,16 +357,16 @@ public class GenomeLoc implements Comparable { public final boolean isSingleBP() { return stop == start; } public final boolean disjointP(GenomeLoc that) { - if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes - if ( this.start > that.stop ) return true; // this guy is past that - if ( that.start > this.stop ) return true; // that guy is past our start + if ( this.contigIndex != that.contigIndex ) return true; // different chromosomes + if ( this.start > that.stop ) return true; // this guy is past that + if ( that.start > this.stop ) return true; // that guy is past our start return false; } public final boolean discontinuousP(GenomeLoc that) { - if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes - if ( (this.start - 1) > that.stop ) return true; // this guy is past that - if ( (that.start - 1) > this.stop ) return true; // that guy is past our start + if ( this.contigIndex != that.contigIndex ) return true; // different chromosomes + if ( (this.start - 1) > that.stop ) return true; // this guy is past that + if ( (that.start - 1) > this.stop ) return true; // that guy is past our start return false; } @@ -386,11 +392,11 @@ public class GenomeLoc implements Comparable { } public final boolean onSameContig(GenomeLoc that) { - return this.contig.equals(that.contig); + return (this.contigIndex == that.contigIndex); } public final int minus( final GenomeLoc that ) { - if ( this.getContig().equals(that.getContig()) ) + if ( this.contigIndex == that.contigIndex ) return (int) (this.getStart() - that.getStart()); else return Integer.MAX_VALUE; @@ -425,6 +431,7 @@ public class GenomeLoc implements Comparable { // // Comparison operations // + // TODO: get rid of this method because it's sloooooooooooooow public static int compareContigs( final String thisContig, final String thatContig ) { if ( thisContig == thatContig ) @@ -471,15 +478,16 @@ public class GenomeLoc implements Comparable { } } - public int compareContigs( GenomeLoc that ) { - return compareContigs( this.contig, that.contig ); + public final int compareContigs( GenomeLoc that ) { + return (this.contigIndex == that.contigIndex)?0:((this.contigIndex < that.contigIndex)?-1:1); } public int compareTo( GenomeLoc that ) { if ( this == that ) return 0; - final int cmpContig = compareContigs( this.getContig(), that.getContig() ); + final int cmpContig = compareContigs(that); + if ( cmpContig != 0 ) return cmpContig; if ( this.getStart() < that.getStart() ) return -1; if ( this.getStart() > that.getStart() ) return 1; diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocTest.java new file mode 100644 index 000000000..5e7eb603d --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocTest.java @@ -0,0 +1,77 @@ +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. + +import org.junit.*; +import static org.junit.Assert.*; +import org.apache.commons.cli.ParseException; +import org.broadinstitute.sting.utils.cmdLine.ArgumentParser; + +import java.util.ArrayList; +import java.io.File; + +/** + * Basic unit test for GenomeLoc + */ +public class GenomeLocTest { + private static FastaSequenceFile2 seq; + + @BeforeClass + public static void init() { + // sequence + seq = new FastaSequenceFile2(new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } + + /** + * Tests that we got a string parameter in correctly + */ + @Test + public void testIsBetween() { + GenomeLoc.setupRefContigOrdering(seq); + GenomeLoc locMiddle = new GenomeLoc("chr1", 3, 3); + + GenomeLoc locLeft = new GenomeLoc("chr1", 1, 1); + GenomeLoc locRight = new GenomeLoc("chr1", 5, 5); + + Assert.assertTrue(locMiddle.isBetween(locLeft, locRight)); + Assert.assertFalse(locLeft.isBetween(locMiddle, locRight)); + Assert.assertFalse(locRight.isBetween(locLeft, locMiddle)); + + } + @Test + public void testContigIndex() { + GenomeLoc locOne = new GenomeLoc("chr1",1,1); + Assert.assertEquals(locOne.getContigIndex(), 1); + Assert.assertEquals(locOne.getContig(), "chr1"); + + GenomeLoc locX = new GenomeLoc("chrX",1,1); + Assert.assertEquals(locX.getContigIndex(), 23); + Assert.assertEquals(locX.getContig(), "chrX"); + + GenomeLoc locNumber = new GenomeLoc(1,1,1); + Assert.assertEquals(locNumber.getContigIndex(), 1); + Assert.assertEquals(locNumber.getContig(), "chr1"); + Assert.assertEquals(locOne.compareTo(locNumber), 0); + + } + + @Test + public void testCompareTo() { + GenomeLoc twoOne = new GenomeLoc("chr2", 1); + GenomeLoc twoFive = new GenomeLoc("chr2", 5); + GenomeLoc twoOtherFive = new GenomeLoc("chr2", 5); + Assert.assertEquals(0, twoFive.compareTo(twoOtherFive)); + + Assert.assertEquals(-1, twoOne.compareTo(twoFive)); + Assert.assertEquals(1, twoFive.compareTo(twoOne)); + + GenomeLoc oneOne = new GenomeLoc("chr1", 5); + Assert.assertEquals(-1, oneOne.compareTo(twoOne)); + Assert.assertEquals(1, twoOne.compareTo(oneOne)); + } + + + +} \ No newline at end of file