From 77868d034f4e006b8e33d2e9bf39447b88790ba7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 May 2013 14:00:43 -0400 Subject: [PATCH] Do not allow the use of Ns in reads for graph construction. Ns are treated as wildcards in the PairHMM so creating haplotypes with Ns gives them artificial advantages over other ones. This was the cause of at least one FN where there were Ns at a SNP position. --- .../readthreading/ReadThreadingGraph.java | 15 +++++++++- .../LocalAssemblyEngineUnitTest.java | 2 +- .../ReadThreadingGraphUnitTest.java | 30 ++++++++++++++++--- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index bbc1618ac..ab6b17c35 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -50,6 +50,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -611,7 +612,7 @@ public class ReadThreadingGraph extends BaseGraph= minBaseQualityToUseInAssembly; + } + /** * Get the set of non-unique kmers in this graph. For debugging purposes * @return a non-null set of kmers diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 74361de1b..a74ce1c75 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { for ( int snpPos = 0; snpPos < windowSize; snpPos++) { if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) { final byte[] altBases = ref.getBytes(); - altBases[snpPos] = 'N'; + altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A'; final String alt = new String(altBases); tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 340777513..67ee52734 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -48,10 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -180,7 +178,31 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertFalse(rtgraph75.hasCycles()); } - // TODO -- update to use determineKmerSizeAndNonUniques directly + @Test(enabled = !DEBUG) + public void testNsInReadsAreNotUsedForGraph() { + + final int length = 100; + final byte[] ref = Utils.dupBytes((byte)'A', length); + + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); + rtgraph.addSequence("ref", ref, null, true); + + // add reads with Ns at any position + for ( int i = 0; i < length; i++ ) { + final byte[] bases = ref.clone(); + bases[i] = 'N'; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); + rtgraph.addRead(read); + } + rtgraph.buildGraphIfNecessary(); + + final SeqGraph graph = rtgraph.convertToSequenceGraph(); + final KBestPaths pathFinder = new KBestPaths<>(false); + Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); + } + + +// TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() { // List tests = new ArrayList();