Bypass spanning deletions in Rank Sum tests

This commit is contained in:
Ron Levine 2016-07-26 12:55:14 -04:00
parent 01fdb90096
commit abc4d5b7b3
14 changed files with 106 additions and 20 deletions

View File

@ -277,7 +277,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
final GATKSAMRecord read = el.getKey(); final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) { if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a); final Double value = getElementForRead(read, refLoc, a);
if ( value == null ) // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
if ( value == null || value < 0.0 )
continue; continue;
if(perAlleleValues.containsKey(a.getMostLikelyAllele())) if(perAlleleValues.containsKey(a.getMostLikelyAllele()))

View File

@ -104,6 +104,8 @@ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_Standard
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0);
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
// Note: For a spanning deletion, readPos is at the upstream end of the deletion and is greater than numAlignedBases (which does not include deletions).
// Hence, the resulting readPos will have a negative value.
if (readPos > numAlignedBases / 2) if (readPos > numAlignedBases / 2)
readPos = numAlignedBases - (readPos + 1); readPos = numAlignedBases - (readPos + 1);
return (double)readPos; return (double)readPos;

View File

@ -76,6 +76,7 @@ import java.util.*;
//TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation //TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false; static final boolean DEBUG = false;
protected static double INVALID_READ_POSITION = -1; // No mapping to a read position
public Map<String, Object> annotate(final RefMetaDataTracker tracker, public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker, final AnnotatorCompatible walker,
@ -86,11 +87,11 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
final GenotypesContext genotypes = vc.getGenotypes(); final GenotypesContext genotypes = vc.getGenotypes();
if (genotypes == null || genotypes.size() == 0) if (genotypes == null || genotypes.isEmpty())
return null; return null;
final ArrayList<Double> refQuals = new ArrayList<>(); final List<Double> refQuals = new ArrayList<>();
final ArrayList<Double> altQuals = new ArrayList<>(); final List<Double> altQuals = new ArrayList<>();
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
@ -183,7 +184,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
final GATKSAMRecord read = el.getKey(); final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) { if ( isUsableRead(read, refLoc) ) {
final Double value = getElementForRead(read, refLoc, a); final Double value = getElementForRead(read, refLoc, a);
if ( value == null ) // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
if ( value == null || value == INVALID_READ_POSITION )
continue; continue;
if ( a.getMostLikelyAllele().isReference() ) if ( a.getMostLikelyAllele().isReference() )

View File

@ -104,6 +104,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
return null; return null;
// If the offset inside a deletion, it does not lie on a read.
if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) {
return INVALID_READ_POSITION;
}
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 ); int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
if (readPos > numAlignedBases / 2) if (readPos > numAlignedBases / 2)

View File

@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
//TODO the old MD5 is kept for the record. //TODO the old MD5 is kept for the record.
//TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach.
// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); // executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b99416c04ba951577f43fd2d25f46388"); executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "7421a776c75d0ab5a2ff89d9e7f105ff");
} }
} }

View File

@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","1d27eaa3557dc28c95b9024114d50ad1"); executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","f4092488c9785d800c3f6470af7119ce");
} }
@Test(enabled = true) @Test(enabled = true)

View File

@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("08967b41ccc76b1f3c7093e51a90713a")); Arrays.asList("75b5a925c2009a8c14ea34fff3d04443"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
} }

View File

@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " +
"-A SnpEff", "-A SnpEff",
1, 1,
Arrays.asList("e99f100fe71bb7f328b485204c16f14a")); Arrays.asList("65641c92469ab80513b04144d0eae900"));
executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
} }

View File

@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("c759b04ed0d948bda95008e29f3f5c2d")); Arrays.asList("e7f216d2f9857a579ef3e211076b37a4"));
executeTest("test MultiSample Pilot1", spec); executeTest("test MultiSample Pilot1", spec);
} }

View File

@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleComplex1() { public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "113ae4c0244c50243313a7d6e77da26b"); HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "86528820f8c102c712d9562b83204c05");
} }
private void HCTestSymbolicVariants(String bam, String args, String md5) { private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -96,7 +96,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleGGAComplex() { public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"8f8680bd8e1549ad88691c9c8af9977c"); "828ef27284bd4045148728952b3a7d94");
} }
@Test @Test
@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleConsensusModeComplex() { public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
"353f1895047b15b1fec22b559c9da0c1"); "060eed2610eed818b2ab55d582eb22ec");
} }
} }

View File

@ -453,4 +453,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest("testBadGQBValues", spec); executeTest("testBadGQBValues", spec);
} }
@Test
public void testHaplotypeCallerGVCSpanDel() {
    // Regression test for rank-sum annotations at a site covered by a spanning deletion:
    // runs HaplotypeCaller in GVCF mode over the single locus 1:26357667 with both the
    // allele-specific (AS_ReadPosRankSumTest) and plain ReadPosRankSumTest annotations enabled,
    // and pins the output to an MD5 checksum.
    // NOTE(review): the BAM name suggests reads spanning a deletion at this locus — the point is
    // presumably that such reads are now bypassed instead of contributing bogus read positions;
    // confirm against the NexPond test data.
    final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d",
            b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("93bc22340e6a4b01a7b96e5a3a12dfc3"));
    // BCF shadow runs are disabled because the expected output is pinned as a VCF MD5 only.
    spec.disableShadowBCF();
    executeTest("testHaplotypeCallerGVCSpanDel", spec);
}
} }

View File

@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeBAMOutFlags() throws IOException { public void testHaplotypeBAMOutFlags() throws IOException {
HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "08943fb76d1cd5b5b8815e3991754911", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756"); HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "56086abc3bd5e3f7d111f452b7cc4fa1", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756");
} }
@Test @Test
@ -153,17 +153,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerGraphBasedSingleSample() throws IOException { public void testHaplotypeCallerGraphBasedSingleSample() throws IOException {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "ba0dc5f416d69558cb5dd3e0a0a5a084"); HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "8ab21bd6fb7ef37480f556fd5fa5375c");
} }
@Test @Test
public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException { public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "129bca18bb9eec23004b2d28aa541de2"); HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "01220e85ff6bc49e35a325a1df2519e5");
} }
@Test @Test
public void testHaplotypeCallerGraphBasedMultiSample() throws IOException { public void testHaplotypeCallerGraphBasedMultiSample() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "2b89c9e102a049e223bc0d91156a08a3"); HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "80c5b0f72a7962e1ba846ec20465001f");
} }
@Test @Test
@ -398,7 +398,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testLackSensitivityDueToBadHaplotypeSelectionFix() { public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("22f5a3e9366e611509f03c984f8b4960")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5087a8855b3ee9ea1091367674783462"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
} }
@ -484,12 +484,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "34328c475325b7dfaa57ab5920478e0c"); HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "2cf4cab0035d09aa0aec6f3faa2c9df6");
} }
@Test @Test
public void testHBaseCountsBySample() throws IOException{ public void testHBaseCountsBySample() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "f5ad4e03c0faaa806ee6ae536af8a479"); HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52");
} }
@Test @Test

View File

@ -538,6 +538,51 @@ public final class AlignmentUtils {
return alignmentPos; return alignmentPos;
} }
/**
 * Tests whether an offset falls inside a deletion element of a CIGAR.
 *
 * Offsets are accumulated over the read/alignment-consuming operators
 * (I, S, D, M, EQ, X); the H, P and N operators contribute nothing to the
 * count. For a deletion spanning cumulative positions (start, end], exactly
 * the offsets start+1 .. end are reported as inside the deletion.
 *
 * @param cigar the read's CIGAR -- cannot be null
 * @param offset the offset into the CIGAR
 * @return true if the offset is inside a deletion, false otherwise
 */
public static boolean isInsideDeletion(final Cigar cigar, final int offset) {
    if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null");
    if ( offset < 0 ) return false;

    // Cumulative number of counted bases consumed by the elements seen so far.
    int basesSeen = 0;
    for ( final CigarElement element : cigar.getCigarElements() ) {
        final CigarOperator operator = element.getOperator();
        switch (operator) {
            case I:
            case S:
            case D:
            case M:
            case EQ:
            case X: {
                final int elementStart = basesSeen;
                basesSeen += element.getLength();
                // The offset lies within this element iff elementStart < offset <= basesSeen;
                // only deletion elements count as "inside a deletion".
                if ( operator == CigarOperator.D && elementStart < offset && offset <= basesSeen )
                    return true;
                break;
            }
            case H:
            case P:
            case N:
                // These operators do not advance the counted position.
                break;
            default:
                throw new ReviewedGATKException("Unsupported cigar operator: " + operator);
        }
    }
    return false;
}
/** /**
* Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions) * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions)
* *

View File

@ -515,6 +515,28 @@ public class AlignmentUtilsUnitTest {
Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString());
} }
/**
 * Verifies {@code AlignmentUtils.isInsideDeletion} against a CIGAR with a single deletion.
 *
 * For 5S 5M 5EQ 6N 5X 6D 1P 1H the counted offset reaches 20 just before the deletion and
 * 26 just after it (N, P and H consume no counted bases), so exactly offsets 21..26 are
 * inside the deletion; everything else — including negative offsets — is outside.
 */
@Test
public void testIsInsideDeletion() {
    final Cigar cigar = new Cigar(Arrays.asList(new CigarElement(5, CigarOperator.S),
                                                new CigarElement(5, CigarOperator.M),
                                                new CigarElement(5, CigarOperator.EQ),
                                                new CigarElement(6, CigarOperator.N),
                                                new CigarElement(5, CigarOperator.X),
                                                new CigarElement(6, CigarOperator.D),
                                                new CigarElement(1, CigarOperator.P),
                                                new CigarElement(1, CigarOperator.H)));
    for ( int offset = -1; offset <= 28; offset++ ) {
        final boolean expectedInside = offset >= 21 && offset <= 26;
        Assert.assertEquals(AlignmentUtils.isInsideDeletion(cigar, offset), expectedInside,
                            "isInsideDeletion mismatch at offset " + offset);
    }
}
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Test AlignmentUtils.readToAlignmentByteArray() // // Test AlignmentUtils.readToAlignmentByteArray() //
//////////////////////////////////////////////////// ////////////////////////////////////////////////////