From 9801dd114fa66250e9f652f3d0ab55af1bc18891 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 27 Apr 2012 09:58:38 -0400 Subject: [PATCH] Bug fix for: https://getsatisfaction.com/gsa/topics/problem_with_indelrealigner_and_l_unmapped The GATK -L unmapped is for GenomeLocs with SAMRecord.NO_ALIGNMENT_REFERENCE_NAME, not SAMRecord.getReadUnmappedFlag() Previously unmapped flag reads in the last bin were being printed while also seeking for the reads without a reference contig. --- .../IntervalOverlapFilteringIterator.java | 5 +- .../sting/utils/sam/AlignmentUtils.java | 11 ++ .../sting/utils/sam/ArtificialSAMUtils.java | 27 ++++ ...ervalOverlapFilteringIteratorUnitTest.java | 149 ++++++++++++++++++ .../walkers/PrintReadsIntegrationTest.java | 51 +++--- .../utils/sam/AlignmentUtilsUnitTest.java | 123 +++++++++++++++ 6 files changed, 340 insertions(+), 26 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java index 4005f1c32..87b356fce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java @@ -28,6 +28,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.List; import java.util.NoSuchElementException; @@ -154,8 +155,8 @@ class IntervalOverlapFilteringIterator implements CloseableIterator { } } else { - // Found an unmapped read. We're done. - if(candidateRead.getReadUnmappedFlag()) { + // Found a -L UNMAPPED read. NOTE: this is different than just being flagged as unmapped! We're done. + if(AlignmentUtils.isReadGenomeLocUnmapped(candidateRead)) { nextRead = candidateRead; break; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index e0fee66ef..998045a8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -525,6 +525,17 @@ public class AlignmentUtils { return alignment; } + /** + * Returns true if the read does not belong to a contig, i.e. it's location is GenomeLoc.UNMAPPED. + * NOTE: A read can have a mapped GenomeLoc and still have an unmapped flag! + * + * @param r record + * @return true if read is unmapped to a genome loc + */ + public static boolean isReadGenomeLocUnmapped(final SAMRecord r) { + return SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(r.getReferenceName()); + } + /** * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 0d3777701..d0211db07 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -334,6 +334,33 @@ public class ArtificialSAMUtils { return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(SAMRecord... reads) { + return createReadIterator(Arrays.asList(reads)); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(List reads) { + final Iterator iter = reads.iterator(); + return new StingSAMIterator() { + @Override public void close() {} + @Override public Iterator iterator() { return iter; } + @Override public boolean hasNext() { return iter.hasNext(); } + @Override public SAMRecord next() { return iter.next(); } + @Override public void remove() { iter.remove(); } + }; + } + private final static int ranIntInclusive(Random ran, int start, int stop) { final int range = stop - start; return ran.nextInt(range) + start; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java new file mode 100644 index 000000000..1a5e99915 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class IntervalOverlapFilteringIteratorUnitTest { + + private SAMFileHeader header; + private GenomeLoc firstContig; + private GenomeLoc secondContig; + + /** Basic aligned and mapped read. */ + private SAMRecord readMapped; + + /** Read with no contig specified in the read, -L UNMAPPED */ + private SAMRecord readNoReference; + + /** This read has a start position, but is flagged that it's not mapped. */ + private SAMRecord readUnmappedFlag; + + /** This read is from the second contig. */ + private SAMRecord readSecondContig; + + /** This read says it's aligned, but actually has an unknown start. */ + private SAMRecord readUnknownStart; + + /** The above reads in the order one would expect to find them in a sorted BAM. */ + private List testReads; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, ArtificialSAMUtils.DEFAULT_READ_LENGTH * 2); + GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + SAMSequenceRecord record; + + record = header.getSequence(0); + firstContig = genomeLocParser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()); + record = header.getSequence(1); + secondContig = genomeLocParser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()); + + readMapped = createMappedRead("mapped", 1); + + readUnmappedFlag = createMappedRead("unmappedFlagged", 2); + readUnmappedFlag.setReadUnmappedFlag(true); + + readSecondContig = createMappedRead("secondContig", 3); + readSecondContig.setReferenceName(secondContig.getContig()); + + /* This read says it's aligned, but to a contig not in the header. */ + SAMRecord readUnknownContig = createMappedRead("unknownContig", 4); + readUnknownContig.setReferenceName("unknownContig"); + + readUnknownStart = createMappedRead("unknownStart", 1); + readUnknownStart.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + + readNoReference = createUnmappedRead("unmappedNoReference"); + + testReads = new ArrayList(); + testReads.add(readMapped); + testReads.add(readUnmappedFlag); + testReads.add(readUnknownStart); + testReads.add(readSecondContig); + testReads.add(readUnknownContig); + testReads.add(readNoReference); + } + + @DataProvider(name = "filteringIteratorTestData") + public Object[][] getFilteringIteratorTestData() { + return new Object[][] { + new Object[] {Arrays.asList(firstContig), Arrays.asList(readMapped, readUnmappedFlag, readUnknownStart)}, + new Object[] {Arrays.asList(GenomeLoc.UNMAPPED), Arrays.asList(readNoReference)}, + new Object[] {Arrays.asList(firstContig, secondContig), Arrays.asList(readMapped, readUnmappedFlag, readUnknownStart, readSecondContig)} + }; + } + + @Test(dataProvider = "filteringIteratorTestData") + public void testFilteringIterator(List locs, List expected) { + IntervalOverlapFilteringIterator filterIter = new IntervalOverlapFilteringIterator( + ArtificialSAMUtils.createReadIterator(testReads), locs); + + List actual = new ArrayList(); + while (filterIter.hasNext()) { + actual.add(filterIter.next()); + } + Assert.assertEquals(actual, expected); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testMappedAndUnmapped() { + new IntervalOverlapFilteringIterator( + ArtificialSAMUtils.createReadIterator(testReads), + Arrays.asList(firstContig, GenomeLoc.UNMAPPED)); + } + + private SAMRecord createUnmappedRead(String name) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } + + private SAMRecord createMappedRead(String name, int start) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + 0, + start, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java index a35348693..4b4946835 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java @@ -5,49 +5,52 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; -import java.util.HashMap; public class PrintReadsIntegrationTest extends WalkerTest { private static class PRTest { - final static String REF = hg18Reference; - final static String BAM = validationDataLocation + "HiSeq.1mb.bam"; - String args; - String md5; + final String reference; + final String bam; + final String args; + final String md5; - private PRTest(String args, String md5) { + private PRTest(String reference, String bam, String args, String md5) { + this.reference = reference; + this.bam = bam; this.args = args; this.md5 = md5; } + + @Override + public String toString() { + return String.format("PRTest(bam='%s', args='%s')", bam, args); + } } @DataProvider(name = "PRTest") - public Object[][] createData1() { + public Object[][] createPrintReadsTestData() { return new Object[][]{ - {new PRTest("", "dc8e5451dd29757c336013146010f73a")}, - {new PRTest(" -compress 0", "fde82269c78c9e91e57286433531b4af")}, - {new PRTest(" -simplifyBAM", "0531717b32a7e21c0de70b1526b0751f")}, - {new PRTest(" -n 10", "cdc4ddf9ee1d2ecf37168da8ef23c270")} }; + {new PRTest(hg18Reference, "HiSeq.1mb.bam", "", "dc8e5451dd29757c336013146010f73a")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -compress 0", "fde82269c78c9e91e57286433531b4af")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -simplifyBAM", "0531717b32a7e21c0de70b1526b0751f")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -n 10", "cdc4ddf9ee1d2ecf37168da8ef23c270")}, + // See: GATKBAMIndex.getStartOfLastLinearBin(), BAMScheduler.advance(), IntervalOverlapFilteringIterator.advance() + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "0a9ce949d07a84cb33a1a8e3358bf679")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e920b8505e7e95d67634b0905237dbc")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "13bb9a91b1d4dd2425f73302b8a1ac1c")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e920b8505e7e95d67634b0905237dbc")}, + {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")} + }; } @Test(dataProvider = "PRTest") public void testPrintReads(PRTest params) { WalkerTestSpec spec = new WalkerTestSpec( - "-T PrintReads -R " + params.REF + - " -I " + params.BAM + + "-T PrintReads" + + " -R " + params.reference + + " -I " + validationDataLocation + params.bam + params.args + " -o %s", Arrays.asList(params.md5)); executeTest("testPrintReads-"+params.args, spec).getFirst(); } - - @Test - public void testPrintReadsReadAllInsertion() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T PrintReads -R " + b37KGReference + - " -I " + validationDataLocation + "oneReadAllInsertion.bam" + - " -o %s", - Arrays.asList("6caec4f8a25befb6aba562955401af93")); - executeTest("testPrintReads-oneReadAllInsertion", spec); - } } - diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java new file mode 100644 index 000000000..5a8582fb2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import junit.framework.Assert; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class AlignmentUtilsUnitTest { + private SAMFileHeader header; + + /** Basic aligned and mapped read. */ + private SAMRecord readMapped; + + /** Read with no contig specified in the read, -L UNMAPPED */ + private SAMRecord readNoReference; + + /** This read has a start position, but is flagged that it's not mapped. */ + private SAMRecord readUnmappedFlag; + + /** This read says it's aligned, but to a contig not in the header. */ + private SAMRecord readUnknownContig; + + /** This read says it's aligned, but actually has an unknown start. */ + private SAMRecord readUnknownStart; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, ArtificialSAMUtils.DEFAULT_READ_LENGTH * 2); + + readMapped = createMappedRead("mapped", 1); + + readNoReference = createUnmappedRead("unmappedNoReference"); + + readUnmappedFlag = createMappedRead("unmappedFlagged", 2); + readUnmappedFlag.setReadUnmappedFlag(true); + + readUnknownContig = createMappedRead("unknownContig", 3); + readUnknownContig.setReferenceName("unknownContig"); + + readUnknownStart = createMappedRead("unknownStart", 1); + readUnknownStart.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + } + + /** + * Test for -L UNMAPPED + */ + @DataProvider(name = "genomeLocUnmappedReadTests") + public Object[][] getGenomeLocUnmappedReadTests() { + return new Object[][] { + new Object[] {readNoReference, true}, + new Object[] {readMapped, false}, + new Object[] {readUnmappedFlag, false}, + new Object[] {readUnknownContig, false}, + new Object[] {readUnknownStart, false} + }; + } + @Test(dataProvider = "genomeLocUnmappedReadTests") + public void testIsReadGenomeLocUnmapped(SAMRecord read, boolean expected) { + Assert.assertEquals(AlignmentUtils.isReadGenomeLocUnmapped(read), expected); + } + + /** + * Test for read being truly unmapped + */ + @DataProvider(name = "unmappedReadTests") + public Object[][] getUnmappedReadTests() { + return new Object[][] { + new Object[] {readNoReference, true}, + new Object[] {readMapped, false}, + new Object[] {readUnmappedFlag, true}, + new Object[] {readUnknownContig, false}, + new Object[] {readUnknownStart, true} + }; + } + @Test(dataProvider = "unmappedReadTests") + public void testIsReadUnmapped(SAMRecord read, boolean expected) { + Assert.assertEquals(AlignmentUtils.isReadUnmapped(read), expected); + } + + private SAMRecord createUnmappedRead(String name) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } + + private SAMRecord createMappedRead(String name, int start) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + 0, + start, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } +}