Better query start / stop function that directly parses the cigar string, unlike the previous version. Now properly handles H (hard-clipped) reads. Added -baq OFF and -baq RECALCULATE integration tests on all three 1KG technologies. Please let me know if this new code somehow fails.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-01-28 15:08:21 +00:00
parent 9cb1ae384c
commit 2182b8c7e2
2 changed files with 92 additions and 4 deletions

View File

@ -1,10 +1,12 @@
package org.broadinstitute.sting.utils.baq;
import net.sf.samtools.Cigar;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -159,8 +161,6 @@ public class BAQ {
// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE
// NOTE -- PUSHED BACK TO HENG LI
//
// Note that _ref and _query are in the special 0-4 encoding [see above for docs]
//
// ####################################################################################################
public int hmm_glocal(final byte[] ref, final byte[] query, int qstart, int l_query, final byte[] _iqual, int[] state, byte[] q) {
if ( ref == null ) throw new ReviewedStingException("BUG: ref sequence is null");
@ -491,6 +491,10 @@ public class BAQ {
public static int MAG = 1; // todo -- remove me for performance testing only
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) {
if ( queryStart < 0 ) throw new ReviewedStingException("BUG: queryStart < 0: " + queryStart);
if ( queryEnd < 0 ) throw new ReviewedStingException("BUG: queryEnd < 0: " + queryEnd);
if ( queryEnd < queryStart ) throw new ReviewedStingException("BUG: queryStart < queryEnd : " + queryStart + " end =" + queryEnd);
// note -- assumes ref is offset from the *CLIPPED* start
BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref);
int queryLen = queryEnd - queryStart;
@ -499,11 +503,65 @@ public class BAQ {
return baqResult;
}
/**
* Determine the appropriate start and stop offsets in the reads for the bases given the cigar string
* @param read
* @return
*/
private final Pair<Integer,Integer> calculateQueryRange(SAMRecord read) {
return calculateQueryRangeCigar(read);
// comparison of old and new way of doing it
// Pair<Integer, Integer> cigarResult = calculateQueryRangeCigar(read);
// Pair<Integer, Integer> prevResult = calculateQueryRangeAlignment(read);
//
// if ( ! cigarResult.equals(prevResult) ) {
// throw new ReviewedStingException("BUG: cigar and prev results differ at read " + read + ", aligment start " + read.getAlignmentStart());
// }
//
// return true ? cigarResult : prevResult;
}
private final Pair<Integer,Integer> calculateQueryRangeCigar(SAMRecord read) {
int queryStart = -1, queryStop = -1;
int readI = 0;
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
switch (elt.getOperator()) {
case N: return null; // cannot handle these
case H : case P : case D: break; // ignore pads, hard clips, and deletions
case I : case S: case M:
int prev = readI;
readI += elt.getLength();
if ( includeClippedBases || elt.getOperator() != CigarOperator.S) {
if ( queryStart == -1 )
queryStart = prev;
queryStop = readI;
}
// in the else case we aren't including soft clipped bases, so we don't update
// queryStart or queryStop
break;
default: throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
}
}
return new Pair<Integer, Integer>(queryStart, queryStop);
}
private final Pair<Integer,Integer> calculateQueryRangeAlignment(SAMRecord read) {
int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart();
int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd());
return new Pair<Integer, Integer>(queryStart, queryEnd);
}
// we need to bad ref by at least the bandwidth / 2 on either side
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
// todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart();
int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd());
Pair<Integer, Integer> queryRange = calculateQueryRange(read);
if ( queryRange == null ) return null; // read has Ns
int queryStart = queryRange.getFirst();
int queryEnd = queryRange.getSecond();
BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);
// cap quals

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class BAQIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T PrintReads -R " + b36KGReference +
" -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
" -o %s" +
" -L 1:10,000,000-10,100,000";
// --------------------------------------------------------------------------------------------------------------
//
// testing BAQ with SLX, 454, and SOLID data
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testPrintReadsNoBAQ() {
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("902197bf77ed5a828d50e08771685928"));
executeTest(String.format("testPrintReadsNoBAQ"), spec);
}
@Test
public void testPrintReadsRecalBAQ() {
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq RECALCULATE", 1, Arrays.asList("4ac691bde1ba1301a59857694fda6ae2"));
executeTest(String.format("testPrintReadsRecalBAQ"), spec);
}
}