Fix for indel calling with UG in presence of reduced reads: When a read is long enough so that there's no reference context available, the reads gets clipped so that it falls again within the reference context range. However, the clipping is incorrect, as it makes the read end precisely at the end of the reference context coordinates. This might lead to a case where a read might span beyond the haplotype if one of the candidate haplotypes is shorter than the reference context (As in the case e.g. with deletions). In this case, the HMM will not work properly and the likelihood will be bad, since "insertions" at end of reads when haplotype is done will be penalized and likelihood will be much lower than it should.

-- Added check to see if read spans beyond reference window MINUS padding and event length. This guarantees that read will always be contained in haplotype.
-- Changed md5's that happen when long reads from old 454 data have their likelihoods changed because of the extra base clipping.
This commit is contained in:
Guillermo del Angel 2013-04-28 09:14:35 -04:00
parent c5701a9ade
commit 20d3137928
4 changed files with 15 additions and 11 deletions

View File

@ -245,8 +245,13 @@ public class PairHMMIndelErrorModel {
}
}
else {
final int refWindowStart = ref.getWindow().getStart();
final int refWindowStop = ref.getWindow().getStop();
// extra padding on candidate haplotypes to make sure reads are always strictly contained
// in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly
// in case bases at edge of reads have lower quality.
final int trailingBases = 3;
final int extraOffset = Math.abs(eventLength);
final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset);
final int refWindowStop = ref.getWindow().getStop()-(trailingBases+extraOffset);
if (DEBUG) {
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
@ -255,10 +260,10 @@ public class PairHMMIndelErrorModel {
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop))
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop);
if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart))
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart);
if (read.isEmpty())
continue;
@ -270,7 +275,6 @@ public class PairHMMIndelErrorModel {
continue;
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
final long readStart = read.getSoftStart();
final long readEnd = read.getSoftEnd();
@ -286,7 +290,6 @@ public class PairHMMIndelErrorModel {
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
final byte [] unclippedReadBases = read.getReadBases();
final byte [] unclippedReadQuals = read.getBaseQualities();
final int extraOffset = Math.abs(eventLength);
/**
* Compute genomic locations that candidate haplotypes will span.
@ -313,6 +316,7 @@ public class PairHMMIndelErrorModel {
startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely;
}
// candidate haplotype cannot go beyond reference context
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
}

View File

@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("54e13f696f56eb742bf449ad11d0dc5f"));
Arrays.asList("8d9b8f8a1479322961c840e461b6dba8"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("e87f5c76661527ef7aa44e528fe19573"));
Arrays.asList("3d4d66cc253eac55f16e5b0a36f17d8d"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

View File

@ -240,7 +240,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("d995b76adc3766889f7c2c88da14055c"));
Arrays.asList("31be725b2a7c15e9769391ad940c0587"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -259,7 +259,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("9669e1643d22d5ad047b3941aeefd6db"));
Arrays.asList("dcc5cec42730567982def16da4a7f286"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("e3efd1917192ea743ac1e9958aa0a98f"));
Arrays.asList("a6c224235c21b4af816b1512eb0624df"));
executeTest("test MultiSample Pilot1", spec);
}