From 2182b8c7e233cc47571be51cf5388e3815f946db Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 28 Jan 2011 15:08:21 +0000 Subject: [PATCH] Better query start / stop function that directly parses the cigar string, unlike the previous version. Now properly handles H (hard-clipped) reads. Added -baq OFF and -baq RECALCULATE integration tests on all three 1KG technologies. Please let me know if this new code somehow fails. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5108 348d0f76-0448-11de-a6fe-93d51630548a --- .../broadinstitute/sting/utils/baq/BAQ.java | 66 +++++++++++++++++-- .../gatk/walkers/BAQIntegrationTest.java | 30 +++++++++ 2 files changed, 92 insertions(+), 4 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index c2fd6e87f..0764b82f7 100644 --- a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -1,10 +1,12 @@ package org.broadinstitute.sting.utils.baq; +import net.sf.samtools.Cigar; import net.sf.samtools.SAMRecord; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -159,8 +161,6 @@ public class BAQ { // NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE // NOTE -- PUSHED BACK TO HENG LI // - // Note that _ref and _query are in the special 0-4 encoding [see above for docs] - // // #################################################################################################### public int hmm_glocal(final byte[] ref, final byte[] query, int qstart, int l_query, final byte[] _iqual, int[] state, byte[] q) { if ( ref == null ) throw new ReviewedStingException("BUG: ref sequence is null"); @@ -491,6 +491,10 @@ public class BAQ { public static int MAG = 1; // todo -- remove me for performance testing only public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) { + if ( queryStart < 0 ) throw new ReviewedStingException("BUG: queryStart < 0: " + queryStart); + if ( queryEnd < 0 ) throw new ReviewedStingException("BUG: queryEnd < 0: " + queryEnd); + if ( queryEnd < queryStart ) throw new ReviewedStingException("BUG: queryStart < queryEnd : " + queryStart + " end =" + queryEnd); + // note -- assumes ref is offset from the *CLIPPED* start BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref); int queryLen = queryEnd - queryStart; @@ -499,11 +503,65 @@ public class BAQ { return baqResult; } + + /** + * Determine the appropriate start and stop offsets in the reads for the bases given the cigar string + * @param read + * @return + */ + private final Pair calculateQueryRange(SAMRecord read) { + return calculateQueryRangeCigar(read); + // comparison of old and new way of doing it +// Pair cigarResult = calculateQueryRangeCigar(read); +// Pair prevResult = calculateQueryRangeAlignment(read); +// +// if ( ! cigarResult.equals(prevResult) ) { +// throw new ReviewedStingException("BUG: cigar and prev results differ at read " + read + ", aligment start " + read.getAlignmentStart()); +// } +// +// return true ? cigarResult : prevResult; + } + + private final Pair calculateQueryRangeCigar(SAMRecord read) { + int queryStart = -1, queryStop = -1; + int readI = 0; + + for ( CigarElement elt : read.getCigar().getCigarElements() ) { + switch (elt.getOperator()) { + case N: return null; // cannot handle these + case H : case P : case D: break; // ignore pads, hard clips, and deletions + case I : case S: case M: + int prev = readI; + readI += elt.getLength(); + if ( includeClippedBases || elt.getOperator() != CigarOperator.S) { + if ( queryStart == -1 ) + queryStart = prev; + queryStop = readI; + } + // in the else case we aren't including soft clipped bases, so we don't update + // queryStart or queryStop + break; + default: throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); + } + } + return new Pair(queryStart, queryStop); + } + + private final Pair calculateQueryRangeAlignment(SAMRecord read) { + int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart(); + int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd()); + return new Pair(queryStart, queryEnd); + } + // we need to bad ref by at least the bandwidth / 2 on either side public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read - int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart(); - int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd()); + Pair queryRange = calculateQueryRange(read); + if ( queryRange == null ) return null; // read has Ns + + int queryStart = queryRange.getFirst(); + int queryEnd = queryRange.getSecond(); + BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd); // cap quals diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java new file mode 100755 index 000000000..702ba9f4f --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class BAQIntegrationTest extends WalkerTest { + private final static String baseCommand = "-T PrintReads -R " + b36KGReference + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000"; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing BAQ with SLX, 454, and SOLID data + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testPrintReadsNoBAQ() { + WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("902197bf77ed5a828d50e08771685928")); + executeTest(String.format("testPrintReadsNoBAQ"), spec); + } + + @Test + public void testPrintReadsRecalBAQ() { + WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq RECALCULATE", 1, Arrays.asList("4ac691bde1ba1301a59857694fda6ae2")); + executeTest(String.format("testPrintReadsRecalBAQ"), spec); + } +}